Pages

SyntaxHighlighter

Sunday, August 28, 2022

A Robust Histogram

I wanted to create a robust histogram to display the response time of a web site. To make a histogram you need to create your own statgraph using the Graphic Template Language (GTL). This requires a lot of research to see what is needed to make it all happen. Below is an image of the end result.

I received some excellent assistance from Lelia at SAS technical support to add the percent sign (%) on the Y axis. This was accomplished by creating a custom format then assigning it to tickvalueformat= option inside linearopts=().

You will see within the proc template code that I am using a vertical reference line to denote the 2.5 second boundary. Each of the bars cover an area between 0.5 above and below the center point. For example, the value 2 has a range of 1.5 to 2.5.

A bunch of summary statistics are handled in GTL by using the eval() function. Below the histogram is a fringeplot that shows the activity of hits by time period. A normal distribution curve was added using the densityplot syntax. Under the fringeplot is a horizontal boxplot that shows the spread of the data in a different fashion than the histogram.

The frequency procedure was used to obtain the percentages for each bar in the display. Here is the SAS data set that was created from the proc freq call.

The final piece I wanted was to display the actual bin values above the bars in the histogram. There are no options to do this using GTL but SAS does support the inclusion of an annotated (sganno) data set. With help from SAS tech support I was able to accomplish that last wish list item resulting in what I call a very robust histogram.

I used a random function (call streaminit with rand()) to create simulated data and suppressed values that exceeded 1.75. The rounded duration value was used with proc freq to drive the annoated data set to populate the values above the bars and the reference line.

The proc template code is very much reusable and is used by proc sgrender to create the output. You will notice that I am writing to the WORK folder via ODS HTML5.

proc format;
   picture mypct (round)
   low - high='009%';
run;

proc template;
  define statgraph distribution;
    dynamic
      VAR
      VARLABEL
      TITLE
      NORMAL
      fmt
    ;
    mvar pct2secs;

    begingraph;
      entrytitle TITLE;
      layout lattice /
        columns         = 1
        rows            = 2
        rowgutter       = 2px
        rowweights      = (.9 .1)
        columndatarange = union
      ;

        columnaxes;
          columnaxis /
            label      = VARLABEL
            display    = (ticks tickvalues label)
            type       = linear
          ;
        endcolumnaxes;    

        layout overlay /
          yaxisopts = (
            offsetmin   = .035
            offsetmax   = .065
            griddisplay = on
            linearopts = (tickvalueformat = mypct4.)
          )

          xaxisopts = (label = "Duration in Seconds");
          referenceline x = 2.5 /
            lineattrs          = (color = red pattern = dash) 
            curvelabel         = pct2secs
            curvelabellocation = outside
          ;

          layout gridded /
            columns   = 2
            border    = true
            autoalign = (topleft topright)
          ;
            entry halign = left "Nobs";
            entry halign = right eval(strip(put(n(VAR), comma12.)));
            entry halign = left "Min";
            entry halign = right eval(strip(putn(min(VAR), '12.3')));
            entry halign = left "Q1";
            entry halign = right eval(strip(putn(q1(VAR), '12.3')));
            entry halign = left "Median";
            entry halign = right eval(strip(putn(median(VAR), '12.3')));
            entry halign = left "Mean";
            entry halign = right eval(strip(putn(mean(VAR), '12.3')));
            entry halign = left "Q3";
            entry halign = right eval(strip(putn(q3(VAR), '12.3')));
            entry halign = left "Max";
            entry halign = right eval(strip(putn(max(VAR), '12.3')));
            entry halign = left "StdDev";
            entry halign = right eval(strip(putn(stddev(VAR), '12.3')));
            entry halign = left "IQR";
            entry halign = right eval(strip(putn(qrange(VAR), '12.3')));
          endlayout;
 
          histogram VAR /
            scale     = percent
            binwidth  = 1
            dataskin  = gloss
          ;
          annotate / id='label';
 
          if (exists(NORMAL))
            densityplot VAR /
              normal()
              name        = 'norm'
              legendlabel = 'Normal'
            ;
          endif;

          fringeplot VAR / datatransparency = .7;

          discretelegend "norm" "kern" /
            location  = inside
            across    = 1
            autoalign = (topright topleft)
            opaque    = true
          ;
        endlayout;

         boxplot y = VAR / orient = horizontal;
     endlayout;
   endgraph;
  end;
run;

 
data hitsinseconds;
  call streaminit(123);       /* set random number seed */
  do _n_ = 1 to 2000;
    duration = round(7 * rand("Uniform"), .001);
    if duration > 1.75 then do;
      if rand("Uniform") > .05 then continue;
    end;
    Seconds = round(duration, 1);
    output;
  end;
run;

proc freq data = hitsinseconds noprint;
  table seconds / out = hitfreq outcum;
run;

data sganno;
  retain id 'label';
  set hitfreq;
  drawspace = 'datavalue';
  function  = 'text';
  x1        = seconds;
  y1        = percent;
  label     = cats(put(percent, 7.2), '%');
  textcolor = 'black';
  textsize  = 8;
  anchor    = 'bottom';
  if seconds = 2 then call symputx('pct2secs', cats(put(cum_pct, 8.), '%'));
run;

ods listing close;
ods graphics on / width = 640px height = 480px;
ods html5 
  path = "%sysfunc(pathname(work))" 
  file = "histogram.html" 
  style = seaside
;

  title;
  proc sgrender
    data     = hitsinseconds
    template = distribution
    sganno   = sganno
  ;

    dynamic
      var      = "duration"
      varlabel = "Time in Seconds"
      normal   = "yes"
      title    = "&pct2secs of hits were under 2.5 seconds"
    ;
    
    format duration 8.1;
  run;
ods html close; 

Monday, June 20, 2022

When a problem comes along, you must whip it!

While I have used the SAS FIND() function frequently, that is not the case for the similarly named FINDW() (find word) function. Taking advice from the 1980s band Devo, a problem came along and I was ready to whip it, whip it good!

A pangram is a unique sentence that contains all the leters of the alphabet at least once. The name comes from the Greek root words pan, meaning "all," and gram, meaning "something written or recorded". The best-known English pangram is "The quick brown fox jumps over the lazy dog" and that is what I will use to test the issue.

The FIND() function returns the starting position of the substring of characters so it can find both Quick and Qui. It is important to note that the 3rd parameter in FIND that uses an 'i' modifier to ignore the case of the characters - upper and lower are treated the same.

Now that we know how the FIND() function works, you would think that FINDW() would do the same - that is the 3rd parameter just needs a 'i' to ignore the case and it will work on full words not a substring. That is not the case for FINDW(), see the differnce between a_findw and a_findw2 that can find the word 'Quick'.

There are times I want an exact match vs a substring match and now I can see the differences - problem whiped - err solved.

data test; 
  str = "The quick brown fox jumps over the lazy dog";
  a_find = find(str, "Quick", 'i');          /* found at position 5 */
  a_findw = findw(str, "Quick", 'i');        /* not found */
  a_findw2 = findw(str, "Quick", ' ', 'i');  /* found at position 5 */
 
  b_find = find(str, 'Qui', 'i');            /* found at position 5 */
  b_findw2 = findw(str, 'Qui', ' ', 'i');    /* not found - no word named Qui */
run;

Should you think this post has gone to the dogs, please consider just as in SAS there are many ways to solve a problem there too are alternate ways to whip it, whippet good!

Sunday, May 15, 2022

If only I had an Array

Modeling data sets often have binary flag variable to indicate if a condition is true or false. Sometimes those indicators need to be collapsed or recoded into a single value based on conditions. See the below code for one technique to handle this.
       if s_del30postscratch_ind  = 1 then curr_delinquency = "D30";
  else if s_del60postscratch_ind  = 1 then curr_delinquency = "D60";
  else if s_del90postscratch_ind  = 1 then curr_delinquency = "D90";
  else if s_del120postscratch_ind = 1 then curr_delinquency = "D120";
  else if s_del150postscratch_ind = 1 then curr_delinquency = "D150";
  else if s_del180postscratch_ind = 1 then curr_delinquency = "D180+";
  else curr_delinquency = "Current";
An alternative way to do the same thing in SAS using two arrays is as follows:
  array avars s_del30postscratch_ind s_del60postscratch_ind s_del90postscratch_ind 
              s_del120postscratch_ind s_del180postscratch_ind;
  array anames[5] $8 _temporary_ ('D30', 'D60', 'D90', 'D120', 'D180+');
  
  do _n_ = 1 to dim(avars);
    if avars[_n_] = 1 then curr_delinquency = anames[_n_];
  end;
  if sum(of avars[*]) = 0 then curr_delinquency = 'Current';
Here is the entire array technique complete with some sample data. The temporary array has the same number of elements so the relative offset matches and makes this a better, more eloquent technique that can be expanded to easily support more if statements if that is what you encounter. Notice the use of the double dash (--) to specify the start and stop columns to process that can be used as a shortcut. Just a different way to do the same thing.
data x;
  length curr_delinquency $8;
  input s_del30postscratch_ind s_del60postscratch_ind s_del90postscratch_ind 
        s_del120postscratch_ind s_del180postscratch_ind;
  array avars s_del30postscratch_ind -- s_del180postscratch_ind;
  array anames[5] $8 _temporary_ ('D30', 'D60', 'D90', 'D120', 'D180+');
  
  do _n_ = 1 to dim(avars);
    if avars[_n_] = 1 then curr_delinquency = anames[_n_];
  end;
  if sum(of avars[*]) = 0 then curr_delinquency = 'Current';
  datalines;
  1 0 0 0 0
  0 1 0 0 0
  0 0 1 0 0
  0 0 0 1 0
  0 0 0 0 1
  1 1 1 1 1 
  0 0 0 0 0
  ;
run;  

Friday, August 27, 2021

Last Column of DSN

You can use the double dash ( -- ) shorthand to drop a list of columns in a data set based on names. I needed to do this for a list of data sets starting at a certain column to the very last column, but the last column name changed between data sets.

The below macro utilizes low level SAS functions to return the name of the last column. So now I can do something like this:

data x;
  set sashelp.class(drop = age -- %lastcolumn(sashelp.class));
run;


%macro lastcolumn(dsn);
  %local dsid retval;

  %let dsid = %sysfunc(open(&dsn));
  %if &dsid %then %do;
     %let retval = %sysfunc(varname(&dsid, %sysfunc(attrn(&dsid, nvars))));
     %let dsid = %sysfunc(close(&dsid));
  %end;

  &retval
%mend;

Thursday, August 19, 2021

My Snippets - SQL Snapshot

Now that SAS Studio is becomming more prevalant, I am moving away from Enterprise Guide into SAS Studio. A nice feature of SAS Studio is the ability to add your own code snippets.

While working on a project, I often find myself having to trace thru a multitude of data sets and track the number of rows and columns at each step. I typically keep all those data sets in the WORK library. I have created the below code snippet to support that type of activity.

The steps to save this snippet are to write the SQL statement as shown below then either right mouse click and select "Add to My Snippets" or click on the icon circled in red. Provide a name then click the Save button. Now you can easily recall that code snippet by clicking on Snippets, My Snippets, snap (short for snapshot).

Below is an example output from running the snap snippet. Notice that the output is sorted by the last modified date so it follows the flow of my program.

Thursday, July 15, 2021

Free Beer

Ok, so maybe the title got you a bit excited but how do you feel about free SODA as in SAS OnDemand for Academics? SAS OnDemand for Academics is a free cloud based version of SAS that uses SAS Studio in your web browser as an editor. This means that there is no software to download and install and that with your credentials you can use SAS anywhere you have an internet connection.

To get started you just need to create a SAS profile for a free account with SAS on this page. If you already have a profile you can use your email address and password to logon. Otherwise, click on the Don't have a SAS Profile link shown in the below image. You will need to verify that your email is valid by acknowledging a verfication email.

After you have logged in, you will see a screen similar to the image below. Notice in the lower right that there is a progress bar used to display how much of your 5GB of free disk space you have used. To get started, click on the SAS Studio link circled in red below.

Once you start SAS Studio, you will see an image similar to that shown below. There is way too much to cover with a blank editor so suggest users look at the SAS Studio documentation for more information.

One quick example is to right click on Files (Home) under Server Files and Folders and create a new folder (I called mine data). Next click on that new folder then on the Upload icon (or right mouse click, Upload Files...). Now that your file is in the cloud, right mouse click and select Import Data. Follow the wizard and if that was an Excel file the data just got loaded into a SAS data set in the cloud. From there do whatever analysis you desire usign the familiar SAS syntax.

The important thing to keep in mind is that all your code and data will live in the cloud there is nothing on your own computer. Hope you enjoyed this post and drink up!

Friday, June 25, 2021

Creating Sample Data

Sample data plays important roles in testing and benchmarking applications. We need to be mindful of personally identifiable information or PII to not reveal actual user data. SAS does provide procedures such as surveyselect to create extended sample data from existing data but that can be risky if PII is not properly addressed. The use of SAS supplied test data such as SASHELP.CLASS or SASHELP.BASEBALL and the like are just too small to be of any value, especially when benchmarking.

As a result, I have created a macro named sampledata that creates fictious data of any size you like in terms of rows. You can modify the code to your desires to expand the number of columns but this gives you a very good start and explains the use of random numbers when applied to the SAS byte() function.

It is important to understand the American Standard Code for Information Interchange or ASCII character set. As it turns out my very first blog post from 10 years ago was on the creation of the printable ASCII values - see below (click to see a clearer image).

As you can see, numeric values 0 - 9 equate to decimal 48 to 57 in ASCII. If you use the SAS byte() function on ASCII 57 as BYTE(57) it will return a 9. Similarly upper case alphabet characters A - Z are represented as ASCII values 65 - 90. Finally, lowercase a - z are the same as ASCII 97 - 122.

Another critical consideration to create the data is to understand that SAS uses numbers to reprsent dates. SAS dates start with zero (0) on January 1, 1960 and increase or decrease from that base value. You will see a fake date of birth (dob) value has been created for each row. I chose a 60 year time span from 1930 - 1990 based on the SAS date methodology - see below. A span of 30 years in SAS is 365.25 * 30 = 10957 rounded. In order to expand this to a 60 year time period, I used the RAND() function and if the return value was less than 0.5 the value was multiplied by a positive 1 otherwise it was multiplied by a negative 1.

The most important piece of this code is the use of the RAND() function. CALL steaminit() is used to specify a starting seed value used by the RAND() function. If a positive number is used in STREAMINIT() the sequence is repeatable. The RAND() function will return a fractional value between 0 and 1 that you can then multiply by whatever value you like. You can see that in the case of both the firstname and lastname values that each are 16 characters in length so I used byte(int(97 + 26 * rand('uniform')) to generate lower case characters (the range A - Z is 26) and lower case ASCII values start at 97. An outer loop was used to create a name that can be up to 16 characters in length. I set a minimum length of 3 so the loop will be used anywhere between 3 and 16 times to create the names.

The distribution of sex ('F' / 'M') is set at 50.8% female. This ratio was taken from the Census bureau. The SAS IFC() function came in handy here.

I wanted to add some reality to this fake data and did that by using the SASHELP.ZIPCODE data set and augmented it by adding in an incrementing key value (1 to number of rows). This makes it easy to randomly extract a zip code from the hash object. I used the hash object reference of H along with its num_items attribute as h.num_items to return the total number of rows loaded into the hash object. Once a mathc is made (it has to in this scenario) then the associated satellite values are copied into the program data vector (PDV). The value column can be whatever you like but here it is set to a random value between a low side of 30,000 up to 1,000,000 so it could be the value of a house.

The results of running this code via %sampledata() will generate the output shown below. After the image, you will find the entire source code used to create the output.

%macro sampledata(
    rows   = 1000000
  , outdsn = work.sampledata
  , seed   = 123
);

  /* dsn of zip codes with incrementing key */
  data work.zipcodes;
    key + 1;
    set sashelp.zipcode(keep = zip city statecode areacode);
  run;

  data &outdsn(keep = id -- value);
    attrib
      id        length =   8
      firstname length = $16
      lastname  length = $16
      sex       length =  $1
      city      length = $35
      statecode length =  $2
      zip       length =   8
      phone     length = $10
      dob       length =   4 format = mmddyy10.
      age       length =   3
      value     length =   8 format = dollar12.
      areacode  length =   8
    ;

    dcl hash h(dataset: "work.zipcodes");
    h.definekey('key');
    h.definedata('zip', 'city', 'statecode', 'areacode');
    h.definedone();
      
    /* seed for random function */
    call streaminit(&seed);   

    do id = 1 to &rows;
      /* firstname can be between 3 and 16 characters long */
      t_firstlen = max(3, ceil(16 * rand('uniform')));
      do t_i = 1 to t_firstlen;
        if t_i = 1 then firstname = byte(int(65 + 26 * rand('uniform')));
        else substr(firstname, t_i, 1) = byte(int(97 + 26 * rand('uniform')));
      end;
      
      t_lastlen  = max(3, ceil(16 * rand('uniform')));
      do t_i = 1 to t_lastlen;
        if t_i = 1 then lastname = byte(int(65 + 26 * rand('uniform')));
        else substr(lastname, t_i, 1) = byte(int(97 + 26 * rand('uniform')));
      end;
      
      /* Females 50.8% per census:  https://census.gov/quickfacts */
      sex = ifc(rand('uniform') >= 0.508, 'F', 'M');
      
      key = ceil(h.num_items * rand('uniform'));
      if h.find() = 0 then do;
        phone = put(areacode, 3.);
        do t_i = 4 to 10;
          substr(phone, t_i, 1) = byte(int(48 + 10 * rand('uniform')));
        end;
      end;

             /* (365.25 * 30) = 10957 or 30 years of days */
      dob   = ceil(10957 * rand('uniform')) * ifn(rand('uniform') < .5, 1, -1);
      age   = int(yrdif(dob, date(), 'actual'));
      value = max(1000000 * rand('uniform'), 30000);
      output;
    end;

    stop;  /* terminates this data step */
  run;   
%mend;