Pages

SyntaxHighlighter

Friday, June 25, 2021

Creating Sample Data

Sample data plays important roles in testing and benchmarking applications. We need to be mindful of personally identifiable information or PII to not reveal actual user data. SAS does provide procedures such as surveyselect to create extended sample data from existing data but that can be risky if PII is not properly addressed. The use of SAS supplied test data such as SASHELP.CLASS or SASHELP.BASEBALL and the like are just too small to be of any value, especially when benchmarking.

As a result, I have created a macro named sampledata that creates fictious data of any size you like in terms of rows. You can modify the code to your desires to expand the number of columns but this gives you a very good start and explains the use of random numbers when applied to the SAS byte() function.

It is important to understand the American Standard Code for Information Interchange or ASCII character set. As it turns out my very first blog post from 10 years ago was on the creation of the printable ASCII values - see below (click to see a clearer image).

As you can see, numeric values 0 - 9 equate to decimal 48 to 57 in ASCII. If you use the SAS byte() function on ASCII 57 as BYTE(57) it will return a 9. Similarly upper case alphabet characters A - Z are represented as ASCII values 65 - 90. Finally, lowercase a - z are the same as ASCII 97 - 122.

Another critical consideration to create the data is to understand that SAS uses numbers to reprsent dates. SAS dates start with zero (0) on January 1, 1960 and increase or decrease from that base value. You will see a fake date of birth (dob) value has been created for each row. I chose a 60 year time span from 1930 - 1990 based on the SAS date methodology - see below. A span of 30 years in SAS is 365.25 * 30 = 10957 rounded. In order to expand this to a 60 year time period, I used the RAND() function and if the return value was less than 0.5 the value was multiplied by a positive 1 otherwise it was multiplied by a negative 1.

The most important piece of this code is the use of the RAND() function. CALL steaminit() is used to specify a starting seed value used by the RAND() function. If a positive number is used in STREAMINIT() the sequence is repeatable. The RAND() function will return a fractional value between 0 and 1 that you can then multiply by whatever value you like. You can see that in the case of both the firstname and lastname values that each are 16 characters in length so I used byte(int(97 + 26 * rand('uniform')) to generate lower case characters (the range A - Z is 26) and lower case ASCII values start at 97. An outer loop was used to create a name that can be up to 16 characters in length. I set a minimum length of 3 so the loop will be used anywhere between 3 and 16 times to create the names.

The distribution of sex ('F' / 'M') is set at 50.8% female. This ratio was taken from the Census bureau. The SAS IFC() function came in handy here.

I wanted to add some reality to this fake data and did that by using the SASHELP.ZIPCODE data set and augmented it by adding in an incrementing key value (1 to number of rows). This makes it easy to randomly extract a zip code from the hash object. I used the hash object reference of H along with its num_items attribute as h.num_items to return the total number of rows loaded into the hash object. Once a mathc is made (it has to in this scenario) then the associated satellite values are copied into the program data vector (PDV). The value column can be whatever you like but here it is set to a random value between a low side of 30,000 up to 1,000,000 so it could be the value of a house.

The results of running this code via %sampledata() will generate the output shown below. After the image, you will find the entire source code used to create the output.

%macro sampledata(
    rows   = 1000000
  , outdsn = work.sampledata
  , seed   = 123
);

  /* dsn of zip codes with incrementing key */
  data work.zipcodes;
    key + 1;
    set sashelp.zipcode(keep = zip city statecode areacode);
  run;

  data &outdsn(keep = id -- value);
    attrib
      id        length =   8
      firstname length = $16
      lastname  length = $16
      sex       length =  $1
      city      length = $35
      statecode length =  $2
      zip       length =   8
      phone     length = $10
      dob       length =   4 format = mmddyy10.
      age       length =   3
      value     length =   8 format = dollar12.
      areacode  length =   8
    ;

    dcl hash h(dataset: "work.zipcodes");
    h.definekey('key');
    h.definedata('zip', 'city', 'statecode', 'areacode');
    h.definedone();
      
    /* seed for random function */
    call streaminit(&seed);   

    do id = 1 to &rows;
      /* firstname can be between 3 and 16 characters long */
      t_firstlen = max(3, ceil(16 * rand('uniform')));
      do t_i = 1 to t_firstlen;
        if t_i = 1 then firstname = byte(int(65 + 26 * rand('uniform')));
        else substr(firstname, t_i, 1) = byte(int(97 + 26 * rand('uniform')));
      end;
      
      t_lastlen  = max(3, ceil(16 * rand('uniform')));
      do t_i = 1 to t_lastlen;
        if t_i = 1 then lastname = byte(int(65 + 26 * rand('uniform')));
        else substr(lastname, t_i, 1) = byte(int(97 + 26 * rand('uniform')));
      end;
      
      /* Females 50.8% per census:  https://census.gov/quickfacts */
      sex = ifc(rand('uniform') >= 0.508, 'F', 'M');
      
      key = ceil(h.num_items * rand('uniform'));
      if h.find() = 0 then do;
        phone = put(areacode, 3.);
        do t_i = 4 to 10;
          substr(phone, t_i, 1) = byte(int(48 + 10 * rand('uniform')));
        end;
      end;

             /* (365.25 * 30) = 10957 or 30 years of days */
      dob   = ceil(10957 * rand('uniform')) * ifn(rand('uniform') < .5, 1, -1);
      age   = int(yrdif(dob, date(), 'actual'));
      value = max(1000000 * rand('uniform'), 30000);
      output;
    end;

    stop;  /* terminates this data step */
  run;   
%mend;

Thursday, June 3, 2021

Recursive Query with PROC SQL

The SAS implementation of SQL is not as robust as most relational databases. One example where SAS falls short is in the area of recursive queries. I know that SQL Server and PostgreSQL handle recursive queries via the use of common table expressions or CTEs using the WITH statement. Hopefully SAS will add this feature in a future release since CTEs were included in the SQL:1999 standard.

In the mean time, I have emulated an existing article "Learn PostgreSQL Recursive Query By Example". Please reference that article for additional details and to compare the different techniques. Credit must be given to my co-worker and excellent SAS programmer, Dave Devoll for the concept he used recently.

data employees;
  infile datalines dsd;
  input 
    employee_id : 4.
    full_name : $32.
    manager_id : 4.
  ;
  datalines;
1, 'Michael North', .
2, 'Megan Berry', 1
3, 'Sarah Berry', 1
4, 'Zoe Black', 1
5, 'Tim James', 1
6, 'Bella Tucker', 2
7, 'Ryan Metcalfe', 2
8, 'Max Mills', 2
9, 'Benjamin Glover', 2
10, 'Carolyn Henderson', 3
11, 'Nicola Kelly', 3
12, 'Alexandra Climo', 3
13, 'Dominic King', 3
14, 'Leonard Gray', 4
15, 'Eric Rampling', 4
16, 'Piers Paige', 7
17, 'Ryan Henderson', 7
18, 'Frank Tucker', 8
19, 'Nathan Ferguson', 8
20, 'Kevin Rampling', 8 
;
run;

The very short macro function code for %isblank can be found here. The initial query in the below code only returns a single row for employee_id = 2, Megan Berry. After that, a test is made to ensure that at least one row was returned. If not, an error message will be generated and the macro will terminate.

If all is good, the output that was written to the lev0 data set is copied to a data set named results. This is done so it can be used to append subsequent query results. The next SQL statement uses a sub-query to reference back to the prior data set using %eval(&iter -1). If that query returned rows (&sqlobs > 0) then it is append to the results data set and is repeated until no rows are returned.

%macro recursive(id =, iter = );
  %if %isblank(&id) %then %do;
    %put %str(E)RROR: Must pass in a PK value.;
    %return;
  %end;

  %if %isblank(&iter) %then %do;
    proc sql;
      create table lev0 as
        select    employee_id
                , full_name
                , manager_id
        from      employees
        where     employee_id = &id
      ;
    quit;

    %if &sqlobs = 0 %then %do;
      %put %str(E)RROR: No entries found using ID = &id;
      %return;
    %end;

    %if %sysfunc(exist(results)) %then %do;
      /* Clear out old version should it exist */
      proc delete data = results;
      run;
    %end;

    data results;
      set lev0;
    run;

    %let iter = 1;
  %end;

  proc sql;
    create table lev&iter as
      select      employee_id
                , full_name
                , manager_id
      from        employees
      where       manager_id in(
        select    distinct employee_id
        from      lev%eval(&iter - 1))
    ;
  quit;

  %if &sqlobs %then %do;
    proc append
      base = results
      data = lev&iter
    ;
    run;
    
    %recursive(id = &id, iter = %eval(&iter + 1 ));
  %end;

%mend;

%recursive(id = 2);

Monday, December 7, 2020

DOW Sort

Sorting data in ascending or descending order is a very easy and straightforward process.  However, there are times when a simple sort does not suffice.  Consider a scenario of submitting a large number of processes in a SAS grid environment that has a limit on the number of active parallel processes.

In that scenario, you will want to mix tasks that take a long time with those quicker running tasks.  There is normally a key driver that can be used to identify longer running processes.  The number of rows and columns are prime candidates to sort the data.

This example uses the available SASHELP.CLASS data set and sorts it by age and height in ascending order.  The ability to read SAS data sets in random order via the POINT= option in a SET statement make this relatively easy.  The idea is to read the highest or last row with the first X rows that are lower in value.  The technique used is a variant of the DOW Loop created and popularized by SAS Gurus Don Henderson, Paul Dorfman and Ian Whitlock.

The below data set is sorted by age and height.



The following data set shows the result of reading the last row then the top 5 rows.  After that the second to the last row is output followed by rows 6 through 10



/*******************************************************************************

     Program: sort_dow.sas

      Author: Tom Bellmer

     Created: 12/07/2020 @ 1:25:37 PM

SAS Version: SAS 9.4 (TS1M6)

          OS: LIN X64

     Purpose: Places subrows under the highest value via a DOW loop

       Notes: Assumes the input data set is sorted in ascending order

              will be used to stuff low value rows below high for RSUBMITS

              to reduce expected time in each parallel session

 

                             Modifications in descending order

FL-YYYYMMDD                             Description

----------- --------------------------------------------------------------------

 

         1    1    2    2    3    3    4    4    5    5    6    6    7    7    8

....5....0....5....0....5....0....5....0....5....0....5....0....5....0....5....0

*******************************************************************************/

 

%macro sort_dow(

    dsnin   =

  , subrows = 5

  , dsnout  = sort_dow

);

 

  %if not %sysfunc(exist(&dsnin)) %then %do;

    %put %str(E)RROR: Invalid input data set: &dsnin..;

    %return;

  %end;

 

  data &dsnout;

    if 0 then set &dsnin nobs = totobs;

 

    do until(totobs = i);

      set &dsnin point = totobs;

      totobs = totobs - 1;

      output;

      if totobs = i then stop;

 

      do _n_ = 1 to &subrows;

        i + 1;

        set &dsnin point = i;

        output;

        if totobs = i then stop;

      end;

    end;

    stop;

  run;

%mend;

 

/*EOF: sort_dow.sas */

 

 

 

proc sort data = sashelp.class out = class;

  by age height;

run;

 

%sort_dow(

    dsnin   = class

  , subrows = 5

  , dsnout  = sort_dow

);

Monday, October 26, 2020

FREQ N Faster

 


I needed to capture and count the unique value of each character column in SAS data sets.  My first thought was to use PROC FREQ and pass all the character variables to the TABLE statement.

Unfortunately, I quickly ran into an out of memory issue and was unable to specify a memsize setting due to established constraints.  It is a shame to have to call PROC FREQ once for each variable but that approach did get me past the out of memory issue.

The best way to test different approaches was to create some test data.  That is the purpose of the createsampledata macro - source code can be found below.  That code created random upper case characters between ASCII values 65 and 90 (A - Z).  In order to simulate real world data better, the length of each value can fluctuate from 0 to 16 characters.  Instead of making each value unique, proc append is used to copy in existing rows to again better simulate real world data.

I decided to try a hybrid approach to push some, but not all variables to the TABLE statements.  The below code shows the results using the sample test data. Your mileage may vary based on the number of rows processed, character length and the cardinality of the data.

As you can see in the below chart, the sweet spot for this set of data is to use 10 columns in the table statement.  After that the elapsed time increases.  Try utilizing this technique to reduce the amount of time it takes to process your freqn data.



%macro createsampledata(

    rows      = 1000000

  , byvalue   = 1000

);

  %local i;

  proc datasets

    lib = work

    nolist

    kill

  ;

  quit;

 

  data results;

    attrib

      stepvalue length = 3

      duration  length = 8 format = time12.3

    ;

    call missing(of _all_);

    stop;

  run;

   

  /* Create sample data */

  data tempdata(drop = t_:);

    array dummy {*} $16 char_1 - char_100;

    call streaminit(123);  

    

    do t_0 = 1 to &rows by &byvalue;

      do t_1 = 1 to dim(dummy);

        do t_2 = 1 to int(17 * rand('uniform'));

          substr(dummy[t_1], t_2) = byte(int(65 + 26 * rand('uniform')));

        end;

      end;

      output;

    end;

  run; 

  

  /* Not everything is unique so add via byvalue */

  %do i = 1 %to &byvalue;

    proc append

      base = sampledata

      data = tempdata

    ;

    run;

  %end;

%mend;

 

%macro freqnfaster(stepvalue = 1);

  %local i j starttime columncount;

 

  proc sql noprint;

    select      name

    into        :name1 -

    from        dictionary.columns

    where           libname = "WORK"

                and memname = "SAMPLEDATA"

                and type = 'char'

    ;

  quit;

  %let columncount = &sqlobs; 

  

  %let starttime = %sysfunc(datetime());

  %do i = 1 %to &columncount %by &stepvalue;

    proc freq

      data  = work.sampledata

      order = freq

      noprint

    ;

    %do j = &i %to %sysfunc(min(%eval(&i + &stepvalue -1), &columncount));

      table &&name&j / missing out = work.freqout&j;

    %end;

    run;

  %end;

 

  proc sql;

    insert into work.results

      set

          stepvalue = &stepvalue

        , duration  = (datetime() - &starttime)

    ;

  quit;

%mend;

 

 

%createsampledata(

    rows      = 1000000

  , byvalue   = 1000

);

 

%freqnfaster(stepvalue = 1)

%freqnfaster(stepvalue = 2)

%freqnfaster(stepvalue = 3)

%freqnfaster(stepvalue = 5)

%freqnfaster(stepvalue = 8)

%freqnfaster(stepvalue = 10)

%freqnfaster(stepvalue = 15)

%freqnfaster(stepvalue = 20)

%freqnfaster(stepvalue = 25)

%freqnfaster(stepvalue = 30)

%freqnfaster(stepvalue = 50)

%freqnfaster(stepvalue = 100)

 

ods _all_ close;

ods listing gpath = "/myfolder";

ods graphics / reset = index imagename = 'freqnfast';

title 'PROC FREQ Results of 1M rows and 100 columns';

proc sgplot data = results;

  hbar stepvalue /

    response = duration

    stat = mean

    datalabel

  ;

  xaxis grid label = 'Duration';

  yaxis grid label = 'Step Value';

run;

Tuesday, August 11, 2020

find text in a file

 

Over time we tend to collect a large number of program files.  It never fails that after much time has past that you need to access some code in a file but can't recall which program the keyword was in.

I work on a Linux operating system so I can use the GREP command to handle the task but I never remember the syntax options and do not like the way the data is returned.  That is why I wrote a macro named %findtext() that handles this for you in SAS via the use of a PIPE command.

If want to work for the keyword 'bitwise' in all SAS programs in a folder and any sub-folders that GREP command will be as follows:

> grep -nRi 'updatestagingdetails' '/em_data1/prod/macro' --include=*.sas

/em_data1/prod/macro/etl.sas:524:       %updatestagingdetails(env_out=&env_out.);

/em_data1/prod/macro/updatestagingdetails.sas:2:*     Program: updatestagingdetails.sas

/em_data1/prod/macro/updatestagingdetails.sas:9:*       Usage: %updatestagingdetails(dsn=work.stagingdetails,env_out=%env())

/em_data1/prod/macro/updatestagingdetails.sas:21:%macro updatestagingdetails( dsn  = work.stagingdetails, env_out = %env() ) ;

/em_data1/prod/macro/updatestagingdetails.sas:40:/*EOF: updatestagingdetails.sas */

Notice in the above output that the values are separated by a colon (:) and it is overall hard to read in my opinion.  This is where the SAS SCAN and FIND functions come into play to create a cleaner easier to read result.

The SAS macro call will be as follows (only the first two parameters are required):

%findtext(

  path      = /em_data1/prod/macro

 , text     = updatestagingdetails 

  , ignorecase   = Y

  , exactmatch   = N

  , extension    = sas

  , outdsn       = findtextresults

  , printresults = Y

);


 

/*******************************************************************************

     Program: findtext.sas

      Author: Tom Bellmer

Responsible: Tom Bellmer

     Created: 08/10/2020 @ 3:00:45 PM

SAS Version: SAS 9.4 (TS1M6)

          OS: LIN X64

     Purpose: read the contents of files searching for the string value

       Usage: %findtext(path = /folder, string = sql);

       Notes: colon is used as output separator.  Scan() function had issues

              with third column that contained colons.

  Parameters: path = (folder path to be searched)

              text = (text to be searched inside each file)

              ignorecase = Y/N to ignore the case of the &string.

              exactmatch = N/Y to match whole &string value

              extension = sas (file extension, use * for all files)

              outdsn = findtextresults (output data set name)

              printresults = Y (proc print the results? Y/N)

 

                             Modifications in descending order

FL-YYYYMMDD                             Description

----------- --------------------------------------------------------------------

 

         1    1    2    2    3    3    4    4    5    5    6    6    7    7    8

....5....0....5....0....5....0....5....0....5....0....5....0....5....0....5....0

*******************************************************************************/

 

%macro findtext(

    path         =

  , text         =

  , ignorecase   = Y

  , exactmatch   = N

  , extension    = sas

  , outdsn       = findtextresults

  , printresults = Y

);

 

  %local options i;

  %do i = 1 %to 100;

    %local pathname&i;

  %end;

 

  %if %isblank(&path) %then %do;

    %put %str(E)RROR: Must pass in a value for path.;

    %return;

  %end;

 

  %if %isblank(&text) %then %do;

    %put %str(E)RROR: Must pass in a value for text.;

    %return;

  %end;

 

  /*   i = ignores case, n = show matched line number

     , R = recursively,  w = match whole word  */

  %let options = -nR;

  %if %upcase(&ignorecase) = Y %then %let options = &options.i;

  %if %upcase(&exactmatch) = Y %then %let options = &options.w;

 

  filename search pipe "grep &options '&text' '&path' --include='*.&extension'";

  data &outdsn(drop = fullfilename findpos);

    attrib

      Path         length = $256 label = 'Path'

      Filename     length = $128 label = 'Filename'

      Lineno       length = 8    label = 'Line No'

      Code         length = $256 label = 'Code'

      Fullfilename length = $256 label = 'Full Filename'

    ;

 

    infile search;

    input;

    fullfilename = scan(_infile_, 1':');

    filename     = scan(fullfilename, -1"/");

    path         = substr(_infile_, 1, lengthn(fullfilename) - lengthn(filename));

    lineno       = input(scan(_infile_, 2':'), 8.);

    findpos      = find(_infile_, ":");

    findpos      = find(_infile_, ":", findpos + 1);

    code         = substr(_infile_, findpos + 1);

  run;

  filename search clear;

 

  %if %getattr(dsn = &outdsn, attr = nlobs) = 0 %then %do;

    proc sql;

      insert into &outdsn

        set path     = "&path"

          , filename = "N/A"

          , lineno   = 0

          , code     = "No results were found using '&text'"

      ;

    quit;

  %end;

 

  %if %upcase(%substr(&printresults, 11)) = Y %then %do;

    proc sql noprint;

      select   distinct path

      into     :pathname1 -

      from     &outdsn;

    quit;

   

    title "Search results for: &text";

    %if &sqlobs = 1 %then %do;

      title2 "In Path: &pathname1";

    %end;

    proc print data = &outdsn;

      %if &sqlobs = 1 %then %do;

         var filename lineno code;

      %end;

    run;

    title;

  %end;

%mend;

 

/*EOF: findtext.sas */