SyntaxHighlighter

Thursday, June 28, 2012

Drop Missing Columns

The below macro was created to read all rows of every column and identify columns that only contain missing values. This comes in very handy in removing clutter (missing values) and more easily focus on columns contianing values instead of scrolling across the screen.

The code runs amazingly fast, about 1 million rows with 30 columns per second on my notebook.

%macro dropmissingcolumns( dsn ) ;
  %local lib
         mem 
         rowcount
         columncount
         jobstarttime ;

  %if %sysfunc( exist( &dsn. ) ) = 0 %then %do ;
    %if %sysfunc( exist( &dsn.,view ) ) = 0 %then %do ;
      %put ERROR: The data set &dsn does not exist. ;
      %return ;
    %end ;
  %end ;

  %let rowcount = %dsnobs( dsn = &dsn ) ;
  %if &rowcount = 0 %then %do ;
    %put NOTE: The data set &dsn contains zero rows of data.  Nothing to do. ;
    %return ;
  %end ;
  %let jobstarttime = %sysfunc( datetime() ) ;

  %if %index( &dsn., . ) > 0 %then %do ;
    %let lib = %upcase( %scan( &dsn., 1 ) ) ;
    %let mem = %upcase( %scan( &dsn., 2 ) ) ;
  %end ;
  %else %do ;
    %let lib = WORK ;
    %let mem = %upcase( &dsn. ) ;
  %end ;

  proc sql noprint ;
    select    name
            , count( name )
      into    :charlist separated by ' ' 
            , :charcount
      from    dictionary.columns
      where       libname = "&lib"
              and memname = "&mem" 
              and type    = 'char' ;

    select    name
            , count( name )
      into    :numlist separated by ' '
            , :numcount
      from    dictionary.columns
      where       libname = "&lib" 
              and memname = "&mem"
              and type    = 'num' ;
  quit ;

  %let columncount = %eval( &charcount. + &numcount. ) ;
  data _null_  ;
    length droplist $8192 ;
    set &dsn. end = eof ;

    %if &charcount. > 0 %then %do ;
      array charvars( * ) $ &charlist. ;
      array c_allmiss ( &charcount. ) $1 ( &charcount. * 'Y' ) ;
    %end ;

    %if &numcount. > 0 %then %do ;
      array numvars( * ) &numlist. ;
      array n_allmiss ( &numcount. )  $1 ( &numcount.  * 'Y' ) ;
    %end ;

    do i = 1 to &charcount. ;
      if not missing( charvars( i ) ) then c_allmiss( i ) = 'N' ;
    end ;
    do i = 1 to &numcount. ;
      if not missing( numvars( i ) )  then n_allmiss( i ) = 'N' ;
    end ;

    if eof then do ;
      dropcount = 0 ;
      droplist = '' ;
      do i = 1 to &charcount. ;
        if c_allmiss( i ) = 'Y' then do ;
          dropcount + 1 ;
          droplist = catx( " ", droplist, vname( charvars( i ) ) ) ;
        end ;
      end ;
      do i = 1 to &numcount. ;
        if n_allmiss( i ) = 'Y' then do ;
          dropcount + 1 ;
          droplist = catx( " ", droplist, vname( numvars( i ) ) ) ;
        end ;
      end ;
    end ;

    call symputx( 'droplist', droplist, 'l' ) ;
    call symputx( 'dropcount', dropcount, 'l' ) ;
  run ;

  %if &dropcount. > 0 %then %do ;
    data &dsn ;
      set &dsn( drop = &droplist ) ;
    run ;
  %end ;

  %put NOTE: dropped &dropcount missing of &columncount columns from &rowcount rows in %sysfunc( strip( %sysfunc( putn( %sysevalf( %sysfunc( datetime()) - &jobstarttime.),9.3) ) ) ) seconds. ;
%mend ;
Here is the source code on the %dsnobs(dsn=) macro used in the above code:
%macro dsnobs( dsn= ) ;
  %local nobs dsid rc  ;
  %let nobs = . ;
  %let dsid = %sysfunc( open( &dsn, i ) ) ; 
 
  %if &dsid %then %do ;
    %let nobs = %sysfunc( attrn( &dsid, nobs ) ) ; 
    %let rc = %sysfunc(close( &dsid ) ) ;
  %end ;

  &nobs 
%mend ;

Wednesday, June 27, 2012

Directory Listing without FILENAME PIPE

In the past when I needed a list of files in a directory I would use the filename command with the pipe device type as follows:
  filename dirlist pipe 'dir "c:\temp\" 
However, under Windows 7, I am getting the following error message noted here:
Stderr output:
There is not enough space on the disk.
NOTE: 0 records were read from the infile DIR.
Also, when code is sent to an IOM server that uses the FILENAME PIPE command, the code fails with ERROR: Access is denied or ERROR: Insufficient authorization. As a result, I wrote the below macro that utilizes SAS component language (SCL) functions to get around these restrictions.
%macro directorylisting
  (
      path   =
    , outdsn = dirlist
    , where  =
    , after  = 01Jan1960
  ) ;
 
  data &outdsn. ( keep = filename fullfilename created modified bytes ) ;
    attrib 
      dref         length = $8
      fref         length = $8
      folder       length = $256
      filename     length = $128 label = 'Filename'
      fullfilename length = $256 label = 'Full Filename'
      created      length = 8    label = 'Created'   format = datetime19.
      modified     length = 8    label = 'Modified'  format = datetime19.
      bytes        length = 8    label = 'Bytes'     format = comma15. ;
 
    if fileexist( "&path." ) then do ;
      folder = ifc( substr( "&path.", lengthn( "&path." ), 1 ) = "\"
        , "&path.", cats( "&path.", "\" ) ) ;
      rc     = filename( dref, folder );
      did    = dopen( dref ) ;
      dcount = dnum( did ) ;
 
      do i = 1 to dcount ;
        filename = dread( did, i ) ;
       
        if find( filename, "&where.", 'i' ) > 0 then do ;
          fullfilename = cats( folder, filename ) ;
          rc  = filename( fref, fullfilename ) ;
          fid = fopen( fref ) ;
          modified = input( finfo( fid, 'Last Modified' ), anydtdtm. ) ;
          if datepart( modified ) >= "&after."d then do ;
            created = input( finfo( fid, 'Create Time' ), anydtdtm. ) ;
            bytes   = input( finfo( fid, 'File Size (bytes)' ), 18. ) ;
            output ;
          end ;
          fid = fclose( fid ) ;
          rc  = filename( fref, '' ) ;
        end ;
      end ;
 
      did = dclose( did ) ;
      rc  = filename( dref, '' ) ;
    end ;
    else put "ERROR: The folder &path. does not exist." ;
  run ;
%mend ;

Thursday, June 7, 2012

GetAttribs( dsn )

If you ever need to add a column in the middle of a SAS data set it can be problematic since the program data vector (PDV) controls the order. As a result, I wrote the below macro to expedite the process by writing column attributes to the SAS log. From there I can review then copy and paste the content into a new program.
%macro getattribs( dsn )  ;
  %if %sysfunc( exist( &dsn ) ) %then %do ;
    %let dsn = %upcase( &dsn ) ;
    %if %index( &dsn, . ) %then %do ;
      %let libname = %scan( &dsn, 1, . ) ;
      %let memname = %scan( &dsn, 2, . ) ;
    %end ;
    %else %do ;
      %let libname = WORK ;
      %let memname = &dsn ;
    %end ;
    
    data _null_ ;
      length line $512 ;
      put /  "Attributes of data set: &dsn"
         //  'attrib' ;
      do until( eof ) ;
        set sashelp.vcolumn end = eof ;
        where libname = "&libname" and memname = "&memname"  ;
        line = cat( strip( name )
          , ' length = ', ifc( type = 'char', '$', '' ), strip( put( length, 5. ) )
          , ifc( missing( label ), ' ', cat( ' label = "', strip( label ), '" '  ) )
          , ifc( missing( informat ), ' ', catx( ' ', ' informat =', informat ) ), ' '
          , ifc( missing( format ), ' ', catx( ' ', ' format =', format ) )
        ) ;
        put +2 line ;
     end ;
     put ';'  ;
     stop ;
   run ;
 %end ;
 %else %put NOTE: Data set &dsn does not exist. ;
%mend ;

%getattribs( sashelp.cars )

Attributes of data set: SASHELP.CARS attrib Make length = $13 Model length = $40 Type length = $8 Origin length = $6 DriveTrain length = $5 MSRP length = 8 format = DOLLAR8. Invoice length = 8 format = DOLLAR8. EngineSize length = 8 label = "Engine Size (L)" Cylinders length = 8 Horsepower length = 8 MPG_City length = 8 label = "MPG (City)" MPG_Highway length = 8 label = "MPG (Highway)" Weight length = 8 label = "Weight (LBS)" Wheelbase length = 8 label = "Wheelbase (IN)" Length length = 8 label = "Length (IN)" ; NOTE: There were 15 observations read from the data set SASHELP.VCOLUMN. WHERE (libname='SASHELP') and (memname='CARS');

Friday, June 1, 2012

Timer Macro

The TIMER macro writes the elapsed time between calling this macro with MODE=ON and MODE=OFF. This comes in very handy when you want to find the slow parts of your code.

%macro timer
  (   mode = ON
    , description = Duration
    , reset = Y
  ) ;
  %global timermacro_starttime ;

  %if %upcase( &mode ) = OFF %then %do ;
    %if %symexist( timermacro_starttime ) %then do ;
      %put &description %sysfunc( putn( %sysevalf( %sysfunc( datetime() )
        - &timermacro_starttime ), 12.3 ) ) seconds @ %sysfunc( time(), time8. ) ;
      %if %upcase( &reset ) = Y %then %symdel timermacro_starttime ;
  %end ;
  %else %let timermacro_starttime = %sysfunc( datetime() ) ;
%mend ;
Example:

  %timer()

    data _null_ ;
    run ;

  %timer( mode = off ) 

    Duration 0.153 seconds