Perl script to parse an HTML file containing industry performance data. Watch out for line wrap. Comments begin with #.
Paul ------------------- perl script starts here ------------------------ #!/bin/perl # # ind_perf.pl - Process Industry performance htm file # create a comma-delimited text file which can be read into Excel. # Output is to STDOUT. # # Usage: perl ind_perf.pl > perf.txt # # Prepared March 8, 1997 - Paul Beattie #
#--------------------------------------------------------------------- # main routine
# process performance file # HTML has been downloaded from ssultra.snapshot.com # and stored in location specified below.
# Path to the input file.  The first command-line argument is used when one
# is given; otherwise fall back to the original default download location,
# so "perl ind_perf.pl > perf.txt" keeps working unchanged.
my $input_file = @ARGV
    ? $ARGV[0]
    : "C:\\Paul\\INVEST\\TA_Perl\\industry.htm";
process_ind_html($input_file);
#------------------------------------------------------------ # subroutine process_ind_html # This routine does the processing, to extract appropriate lines from the html file. # It rearranges data by rows and creates comma-delimited output for use in Excel. # # Input file specifies Title, Headings, Columns in separate sections of HTML.
#Usage: process_ind_html (.html filename)
# Processing states are: FINDTITLE, TITLE, FIND_HEADINGS, HEADINGS, COLUMNS, QUIT
#
# process_ind_html($filename)
#
# Reads the HTML file $filename and writes comma-delimited text to STDOUT:
#   1. the caption (title) lines, one per input line, with commas blanked out
#      because the comma is used as the output delimiter;
#   2. one line holding all column headings, comma separated;
#   3. the table body.  The HTML lists the data column by column (each "<TD"
#      starts a new column, each following line is one cell), so the cells are
#      collected per row and printed one row per line.
#
# Returns nothing.  If the file cannot be opened a message is written to
# STDERR and the routine returns without producing output.
sub process_ind_html {
    my ($filename) = @_;

    my $heading       = "";    # accumulated heading line
    my $first_heading = 1;     # true until the first heading text is stored
    my $col           = 0;     # current column number (1-based once a <TD is seen)
    my $row           = 0;     # current row index within the current column
    my @rows;                  # output rows, built up column by column

    # Patterns expected in the html file (case-sensitive, as downloaded).
    my $title_pat    = qr/^<CAPTION/;     # titles follow this - may be multiple lines
    my $end_title    = qr/^<\/CAPTION/;   # titles end with this
    my $heading_pat  = qr/^<TR>/;         # headings start with this
    my $end_headings = qr/^<\/TR>/;       # headings end with this
    my $newcol_pat   = qr/<TD/;           # a new column begins with this
    my $endcol_pat   = qr/<\/TD/;         # lines containing </TD are ignored
    my $quit_pat     = qr/<\/TABLE/;      # end of the table: stop processing

    # Open file for processing.
    my $in;
    unless (open($in, '<', $filename)) {
        print STDERR "Can't open $filename: $!\n";
        return;
    }

    my $state = "FINDTITLE";    # initial state

    LINE:
    while (my $line = <$in>) {
        # Strip the line terminator only (LF or CRLF).  Unlike chop this
        # never eats a real character from a final line lacking a newline.
        $line =~ s/\r?\n\z//;

        #-------------- Find and process title (caption) --------------
        if ($state eq "FINDTITLE" && $line =~ $title_pat) {
            $state = "TITLE";
            # fall through: the <CAPTION line itself may carry title text
        }
        if ($state eq "TITLE") {
            if ($line =~ $end_title) {
                $state = "FIND_HEADINGS";
                next LINE;
            }
            $line =~ s/<BR>/ /g;       # replace <BR> with blanks
            $line =~ s/,/ /g;          # commas are our delimiter: blank them
            $line =~ s/<[^>]+>//g;     # get rid of HTML commands
            print $line, "\n";
            next LINE;
        }

        #-------------- Find and process column headings --------------
        if ($state eq "FIND_HEADINGS" && $line =~ $heading_pat) {
            $state         = "HEADINGS";
            $first_heading = 1;
            next LINE;
        }
        if ($state eq "HEADINGS") {
            if ($line =~ $end_headings) {
                print $heading, "\n";
                $state = "COLUMNS";
                next LINE;
            }
            $line =~ s/<BR>/ /g;                  # replace <BR> with blanks
            $line =~ s/<\/TH><TH/<\/TH>,<TH/g;    # several headings on one line:
                                                  # put a comma between them
            $line =~ s/<[^>]+>//g;                # get rid of HTML commands
            $line =~ s/^ *//;                     # leading blanks
            $line =~ s/ *$//;                     # trailing blanks
            $line =~ s/ *, */,/g;                 # blanks around EVERY comma delimiter

            if ($first_heading) {
                $heading       = $line;           # no leading comma first time through
                $first_heading = 0;
            }
            else {
                $heading .= "," . $line;
            }
            next LINE;
        }

        #-------------- Find and process columns --------------
        if ($state eq "COLUMNS") {
            if ($line =~ $quit_pat) {
                $state = "QUIT";
                last LINE;                        # nothing after the table is used
            }
            if ($line =~ $newcol_pat) {           # start of the next column
                $col++;
                $row = 0;
                next LINE;
            }
            next LINE if $line =~ /<TR>/;         # ignore row markers
            next LINE if $line =~ $endcol_pat;    # ignore end-of-column markers

            $line =~ s/<BR>//g;                   # replace <BR> with nothing
            $line =~ s/<[^>]+>//g;                # get rid of HTML commands
            $line =~ s/&amp;/&/g;                 # decode "&amp;" into an ampersand
            $line =~ s/^ *//;                     # leading blanks
            $line =~ s/ *$//;                     # trailing blanks

            if ($col > 1) {
                if ($row <= $#rows) {
                    $rows[$row] .= "," . $line;
                }
                else {
                    # A row that did not exist in the earlier columns: pad
                    # with one empty field per preceding column so the value
                    # lands in column $col.
                    push @rows, ("," x ($col - 1)) . $line;
                }
            }
            else {
                push @rows, $line;
            }
            $row++;
        }
    }
    close($in);

    #-------------- Print collected rows, one per line --------------
    # Every row, including the last, is terminated with a newline.
    print $_, "\n" for @rows;

    return;
}
# -- end of ind_perf.pl |