Pages

SyntaxHighlighter

Thursday, June 28, 2012

Drop Missing Columns

The below macro was created to read all rows of every column and identify columns that only contain missing values. This comes in very handy in removing clutter (missing values) and more easily focus on columns contianing values instead of scrolling across the screen.

The code runs amazingly fast, about 1 million rows with 30 columns per second on my notebook.

%macro dropmissingcolumns( dsn ) ;
  %local lib
         mem 
         rowcount
         columncount
         jobstarttime ;

  %if %sysfunc( exist( &dsn. ) ) = 0 %then %do ;
    %if %sysfunc( exist( &dsn.,view ) ) = 0 %then %do ;
      %put ERROR: The data set &dsn does not exist. ;
      %return ;
    %end ;
  %end ;

  %let rowcount = %dsnobs( dsn = &dsn ) ;
  %if &rowcount = 0 %then %do ;
    %put NOTE: The data set &dsn contains zero rows of data.  Nothing to do. ;
    %return ;
  %end ;
  %let jobstarttime = %sysfunc( datetime() ) ;

  %if %index( &dsn., . ) > 0 %then %do ;
    %let lib = %upcase( %scan( &dsn., 1 ) ) ;
    %let mem = %upcase( %scan( &dsn., 2 ) ) ;
  %end ;
  %else %do ;
    %let lib = WORK ;
    %let mem = %upcase( &dsn. ) ;
  %end ;

  proc sql noprint ;
    select    name
            , count( name )
      into    :charlist separated by ' ' 
            , :charcount
      from    dictionary.columns
      where       libname = "&lib"
              and memname = "&mem" 
              and type    = 'char' ;

    select    name
            , count( name )
      into    :numlist separated by ' '
            , :numcount
      from    dictionary.columns
      where       libname = "&lib" 
              and memname = "&mem"
              and type    = 'num' ;
  quit ;

  %let columncount = %eval( &charcount. + &numcount. ) ;
  data _null_  ;
    length droplist $8192 ;
    set &dsn. end = eof ;

    %if &charcount. > 0 %then %do ;
      array charvars( * ) $ &charlist. ;
      array c_allmiss ( &charcount. ) $1 ( &charcount. * 'Y' ) ;
    %end ;

    %if &numcount. > 0 %then %do ;
      array numvars( * ) &numlist. ;
      array n_allmiss ( &numcount. )  $1 ( &numcount.  * 'Y' ) ;
    %end ;

    do i = 1 to &charcount. ;
      if not missing( charvars( i ) ) then c_allmiss( i ) = 'N' ;
    end ;
    do i = 1 to &numcount. ;
      if not missing( numvars( i ) )  then n_allmiss( i ) = 'N' ;
    end ;

    if eof then do ;
      dropcount = 0 ;
      droplist = '' ;
      do i = 1 to &charcount. ;
        if c_allmiss( i ) = 'Y' then do ;
          dropcount + 1 ;
          droplist = catx( " ", droplist, vname( charvars( i ) ) ) ;
        end ;
      end ;
      do i = 1 to &numcount. ;
        if n_allmiss( i ) = 'Y' then do ;
          dropcount + 1 ;
          droplist = catx( " ", droplist, vname( numvars( i ) ) ) ;
        end ;
      end ;
    end ;

    call symputx( 'droplist', droplist, 'l' ) ;
    call symputx( 'dropcount', dropcount, 'l' ) ;
  run ;

  %if &dropcount. > 0 %then %do ;
    data &dsn ;
      set &dsn( drop = &droplist ) ;
    run ;
  %end ;

  %put NOTE: dropped &dropcount missing of &columncount columns from &rowcount rows in %sysfunc( strip( %sysfunc( putn( %sysevalf( %sysfunc( datetime()) - &jobstarttime.),9.3) ) ) ) seconds. ;
%mend ;
Here is the source code on the %dsnobs(dsn=) macro used in the above code:
%macro dsnobs( dsn= ) ;
  %local nobs dsid rc  ;
  %let nobs = . ;
  %let dsid = %sysfunc( open( &dsn, i ) ) ; 
 
  %if &dsid %then %do ;
    %let nobs = %sysfunc( attrn( &dsid, nobs ) ) ; 
    %let rc = %sysfunc(close( &dsid ) ) ;
  %end ;

  &nobs 
%mend ;

No comments:

Post a Comment