I needed to capture and count the unique value of each character column in SAS data sets. My first thought was to use PROC FREQ and pass all the character variables to the TABLE statement.
Unfortunately, I quickly ran into an out of memory issue and was unable to specify a memsize setting due to established constraints. It is a shame to have to call PROC FREQ once for each variable but that approach did get me past the out of memory issue.
The best way to test different approaches was to create some test data. That is the purpose of the createsampledata macro - source code can be found below. That code created random upper case characters between ASCII values 65 and 90 (A - Z). In order to simulate real world data better, the length of each value can fluctuate from 0 to 16 characters. Instead of making each value unique, proc append is used to copy in existing rows to again better simulate real world data.
I decided to try a hybrid approach to push some, but not all variables to the TABLE statements. The below code shows the results using the sample test data. Your mileage may vary based on the number of rows processed, character length and the cardinality of the data.
As you can see in the below chart, the sweet spot for this set of data is to use 10 columns in the table statement. After that the elapsed time increases. Try utilizing this technique to reduce the amount of time it takes to process your freqn data.
%macro createsampledata(
rows = 1000000
, byvalue = 1000
);
%local i;
proc datasets
lib = work
nolist
kill
;
quit;
data results;
attrib
stepvalue length = 3
duration length = 8 format = time12.3
;
call missing(of _all_);
stop;
run;
/* Create sample data */
data tempdata(drop = t_:);
array dummy {*} $16 char_1 - char_100;
call streaminit(123);
do t_0 = 1 to &rows by &byvalue;
do t_1 = 1 to dim(dummy);
do t_2 = 1 to int(17 * rand('uniform'));
substr(dummy[t_1], t_2) = byte(int(65 + 26 * rand('uniform')));
end;
end;
output;
end;
run;
/* Not everything is unique so add via byvalue */
%do i = 1 %to &byvalue;
proc append
base = sampledata
data = tempdata
;
run;
%end;
%mend;
%macro freqnfaster(stepvalue = 1);
%local i j starttime columncount;
proc sql noprint;
select name
into :name1 -
from dictionary.columns
where libname = "WORK"
and memname = "SAMPLEDATA"
and type = 'char'
;
quit;
%let columncount = &sqlobs;
%let starttime = %sysfunc(datetime());
%do i = 1 %to &columncount %by &stepvalue;
proc freq
data = work.sampledata
order = freq
noprint
;
%do j = &i %to %sysfunc(min(%eval(&i + &stepvalue -1), &columncount));
table &&name&j / missing out = work.freqout&j;
%end;
run;
%end;
proc sql;
insert into work.results
set
stepvalue = &stepvalue
, duration = (datetime() - &starttime)
;
quit;
%mend;
%createsampledata(
rows = 1000000
, byvalue = 1000
);
%freqnfaster(stepvalue = 1)
%freqnfaster(stepvalue = 2)
%freqnfaster(stepvalue = 3)
%freqnfaster(stepvalue = 5)
%freqnfaster(stepvalue = 8)
%freqnfaster(stepvalue = 10)
%freqnfaster(stepvalue = 15)
%freqnfaster(stepvalue = 20)
%freqnfaster(stepvalue = 25)
%freqnfaster(stepvalue = 30)
%freqnfaster(stepvalue = 50)
%freqnfaster(stepvalue = 100)
ods _all_ close;
ods listing gpath = "/myfolder";
ods graphics / reset = index imagename = 'freqnfast';
title 'PROC FREQ Results of 1M rows and 100 columns';
proc sgplot data = results;
hbar stepvalue /
response = duration
stat = mean
datalabel
;
xaxis grid label = 'Duration';
yaxis grid label = 'Step Value';
run;
No comments:
Post a Comment