前面我们介绍了数据的理解和数据预处理的变量处理,今天我们将介绍观测的处理。
图4-1 数据预处理流程
3.1 筛选观测
观测的筛选可以通过DATA步的WHERE语句和数据集选项WHERE=、IF语句、DELETE语句、PROC SQL等方式来实现。
*===数据集选项WHERE=从输入数据集筛选观测;
DATA weight(drop=black); /*临时数据集weight删除变量black*/
SET sashelp.Bweight(where=(black=0));/*变量black=0的观测输出到临时数据集weight*/
RUN;
*===数据集选项WHERE=从输入数据集筛选观测;
DATA weight(drop=black); /*临时数据集weight删除变量black*/
SET sashelp.Bweight;/*变量black=0的观测输出到临时数据集weight*/
WHERE black=0;
RUN;
*===数据集选项WHERE=从输出数据集筛选观测报错;
DATA weight1(where=(black=0));
SET sashelp.Bweight;
RUN;
图4-2 数据集选项读入数据的报错结果
*===数据集选项WHERE=从输出数据集筛选观测;
DATA weight1(where=(black=0) drop=black);
SET sashelp.Bweight;
RUN;
图4-3 数据集选项输出数据的报错结果
*===数据集选项WHERE=与DROP语句/DROP=筛选数据;
DATA weight1(where=(black=0));
SET sashelp.Bweight;
DROP black;
RUN;
图4-4 WHERE语句筛选数据的报错结果
在数据整理(一)中我们介绍了KEEP/DROP语句和数据集选项KEEP=/DROP=的应用,实际中对数据进行预处理时,变量和观测的筛选通常是组合应用的。KEEP/DROP语句、数据集选项KEEP=/ DROP=、RENAME语句/数据集选项RENAME=、WHERE语句/数据集选项WHERE=在应用时,是要遵循执行规则的(图4-2)。以下面的三段程序来进一步说明在数据集输、输入阶段和SAS语句处理阶段的执行顺序。
图4-5 SAS语句(或数据集选项)执行顺序
*===数据集输入阶段WHERE=/DROP=/RENAME=选项筛选观测;
DATA weight;
SET sashelp.Bweight(WHERE=(black=0 and education in (0,10)) rename=(ed=education) keep=weight black married boy ed);
/*KEEP=最先执行,RENAME=在KEEP=之后执行,WHERE最后执行*/
RUN;
*===数据集输出阶段WHERE=/DROP=/RENAME=选项筛选观测;
DATA weight(WHERE=(black=0 and education in (0,10)) rename=(ed=education) keep=weight black married boy ed);
/*KEEP=最先执行,RENAME=在KEEP=之后执行,WHERE最后执行*/
SET sashelp.Bweight;
RUN;
*===WHERE/DROP/RENAME语句筛选观测;
DATA weight;
SET sashelp.Bweight;
RENAME ed=education;/*将变量名ed修改为education,最后执行*/
DROP black;/*数据集weight1中删除变量black,在WHERE之后、RENAME之前执行*/
WHERE black=0 and ed in (0,1);/*筛选变量black=0且ed=0或ed=1的观测,最先执行*/
RUN;
*===WHERE语句中SAME-AND筛选变量;
PROC GPLOT data=years;
PLOT unit*quar=year;
RUN;
WHERE year > '01jan2009'd;
RUN;
WHERE same and year < '01jan2012'd;
RUN;
QUIT;
在WHERE语句中可以使用一些SAS算符和特殊的WHERE表达式算符(详见4-1),其中应用技巧需要在实践中不断提高。
表4-1 WHERE语句操作符
*===IF语句筛选观测;
DATA weight;
SET sashelp.Bweight;
IF black=0;/*输出符合条件的观测*/
RUN;
*===PROC SQL筛选观测;
PROC SQL OUTOBS=100;
CREATE TABLE weight AS
SELECT weight, black, married, ed
FROM sashelp.Bweight;
QUIT;
3.2 删除观测
*===REMOVE语句删除;
DATA food;
INPUT name $ water energy protein fat cho @@;
DATALINES;
黄豆 10.2 390 35.0 16.0 34.2
黑豆 9.9 401 36.0 15.9 33.6
青豆 9.5 398 34.5 16.0 35.4
豆腐 78.6 116 9.2 8.1 3.0
豆汁 97.4 10 0.9 0.1 1.4
豆浆 93.8 31 3.0 1.6 1.2
腐竹 7.9 461 44.6 21.7 22.3
;
RUN;
DATA food1;
MODIFY food1;
IF name="黄豆" THEN REMOVE;
RUN;
*===IF语句删除观测;
DATA weight;
SET sashelp.Bweight;
IF black ne 1;/*删除观测*/
RUN;
*===IF-THEN DELETE语句删除观测;
DATA food2;
SET food1;
IF name eq '腐竹' THEN DELETE;/*删除观测*/
RUN;
*===PROC SQL删除观测;
PROC SQL;
DELETE
FROM food1
WHERE name like '豆%';
QUIT;
*====PROC SOTR程序删除重复观测;
*===创建删除重复变量的数据集;
PROC IMPORT OUT=classfit
DATAFILE="E:study fileJindingtongjiSASclass.xls"
DBMS=excel
REPLACE;
RUN;
*===RPOC SORT删除重复观测;
PROC SORT data=classfit
OUT=class
DUPOUT=class_dup
NODUPKEY;
BY name sex;
RUN;
3.3 修改观测
*===REPLACE替换观测;
DATA master;
INPUT FirstName $ id $ PhoneNumber;
DATALINES;
Kevin ABCjkh 904
Sandi defsns 905
Terry ghitDP 951
Jason jklJWM 962
;
DATA trans;
INPUT FirstName $ id $ PhoneNumber;
DATALINES;
. ABCjkh 2904
. defsns 2905
Madeline mnombt 2983
;
DATA master;
MODIFY master trans;/* 用trans数据集中的信息修改master数据集 */
BY id; /* id排序*/
IF _iorc_ = %sysrc(_sok) THEN REPLACE; /* 在master中找到观测替换*/
/* obs not in master */
ELSE IF _iorc_ = %sysrc(_dsenmr) THEN DO;/*在主数据集中没能找到与修改数据相应的观测*/
_error_=0;
_iorc_=0;
OUTPUT;
END;
RUN;
图4-6 用trans数据集信息修改master数据集结果
*===SET修改观测;
DATA food1;
SET food;
energy=energy*4.18;
RUN;
*===MODIFY修改观测;
DATA food;
MODIFY food;
energy=energy*4.18;
RUN;
*===PROC SQL修改观测;
PROC SQL food;
UPDATE food
SET energy=energy*4.18;
QUIT;
3.4 累加观测
*===IF语句分组累加;
DATA weight;
SET sashelp.Bweight;
RUN;
PROC SORT data=weight;
by ed;/*根据变量ed进行排序*/
RUN;
DATA weight1(KEEP=ed cum_weight cnt_weight avg_weight);
SET weight end=last;
BY ed;
IF first.ed THEN DO;/*观测为分组变量的第一条时就执行累加*/
cum_weight=0;
cnt_weight=0;
END;
cum_weight+weight;
cnt_weight+1;
IF last.ed THEN DO;/*观测为分组变量的最后一条计算变量weight的算术平均值*/
avg_weight=cum_weight/cnt_weight;
OUTPUT;
END;
RUN;
*===PROC SQL分组累加;
PROC SQL;
CREATE TABLE weight2 AS
SELECT ed, sum(weight) AS cum_weight, count(weight) AS cnt_weight,
avg(weight) AS avg_weight
FROM sashelp.Bweight
GROUP BY ed;
QUIT;
表4-2 DATA步常用语句
参考文献:
[1] 谷鸿秋. SAS编程演义[M]. 北京:清华大学出版社,2017.
[2] 高惠璇. SAS系统Base SAS软件使用手册[M]. 北京:中国统计出版社,1997.
[3] https://support.sas.com/en/software/base-sas-support.html.
[4] 夏庄坤, 徐唯, 潘红莲, 等. 深入解析SAS——数据处理、分析优化与商业应用[M]. 北京: 机械工业出版社,2014.
[5] 姚志勇. SAS编程与数据挖掘商业案例[M]. 北京: 机构工业出版社, 2010.
---SAS系列---
感谢点亮再看!
联系客服