SI
SI
discoversearch

We've detected that you're using an ad content blocking browser plug-in or feature. Ads provide a critical source of revenue to the continued operation of Silicon Investor.  We ask that you disable ad blocking while on Silicon Investor in the best interests of our community.  If you are not using an ad blocker but are still receiving this message, make sure your browser's tracking protection is set to the 'standard' level.
Strategies & Market Trends : TA- Scans and System Tests

 Public ReplyPrvt ReplyMark as Last ReadFilePrevious 10Next 10PreviousNext  
To: Paul Beattie who wrote (392)3/8/1997 8:39:00 PM
From: Paul Beattie   of 989
 
Perl script to parse an HTML file containing industry performance data.
Watch out for line wrap. Comments begin with #.

Paul
------------------- perl script starts here ------------------------
#!/bin/perl
#
# ind_perf.pl - Process the Industry performance HTML file and
# create a comma-delimited text file which can be read into Excel.
# Output is to STDOUT.
#
# Usage: perl ind_perf.pl > perf.txt
#
# Prepared March 8, 1997 - Paul Beattie
#

#---------------------------------------------------------------------
# main routine

# process performance file
# HTML has been downloaded from ssultra.snapshot.com.
# The input path is taken from the first command-line argument when one
# is given (e.g. "perl ind_perf.pl industry.htm > perf.txt"); otherwise
# the default location below is used, so running the script with no
# arguments behaves exactly as before.

$filename = @ARGV ? $ARGV[0] : "C:\\Paul\\INVEST\\TA_Perl\\industry.htm"; # path to input file

process_ind_html ($filename);

#------------------------------------------------------------
# subroutine process_ind_html
#
# Extracts the title, column headings and column data from an industry
# performance HTML file and prints them as comma-delimited text (for
# Excel) on the currently selected output handle, normally STDOUT.
#
# The input file is expected to present, in this order:
#   title    - from a line starting with <CAPTION up to a line starting
#              with </CAPTION (may be several lines)
#   headings - the lines between a line starting with <TR> and the next
#              line starting with </TR>
#   columns  - every line containing <TD starts a new column; each
#              following line is one cell of that column.  The data is
#              therefore stored column by column in the file and is
#              transposed here so that the output is one line per row.
#   end      - the first line containing </TABLE ends processing.
#
# Usage: process_ind_html (.html filename)
#
# Returns 1 on success.  If the file cannot be opened a message is
# written to STDERR and the routine returns nothing.
#
# Processing states are: FINDTITLE, TITLE, FIND_HEADINGS, HEADINGS, COLUMNS
sub process_ind_html {
    my ($filename) = @_;

    # Patterns expected in the html file (interpolated into regexes below).
    my $title_pat    = "<CAPTION";  # titles follow this pattern - may be multiple lines
    my $end_title    = "</CAPTION"; # titles end with this pattern
    my $heading_pat  = "<TR>";      # headings start with this pattern
    my $end_headings = "</TR>";     # headings end with this pattern
    my $newcol_pat   = "<TD";       # new columns begin with this pattern
    my $endcol_pat   = "</TD";      # lines containing </TD are ignored
    my $quit_pat     = "</TABLE";   # nothing after this line is of interest

    # Open file for processing
    my $in;
    unless (open($in, '<', $filename)) {
        print STDERR "Can't open $filename: $!\n";
        return;
    }

    my $state    = "FINDTITLE"; # initial state
    my @headings = ();          # heading text, one entry per heading line
    my @rows     = ();          # output rows; kept per call so that repeated
                                # calls do not mix data from different files
    my $col      = 0;           # current column number (1-based once a <TD is seen)
    my $row      = 0;           # current row within the current column (0-based)

    LINE: while (my $line = <$in>) {

        # Remove the line terminator only.  chomp (unlike chop) leaves a
        # final line without a newline intact; the \r covers CRLF files.
        chomp $line;
        $line =~ s/\r\z//;

        #--------------Find and Process Title (Captions)--------------------
        # The <CAPTION line itself falls through and is handled as a title line.
        if ($state eq "FINDTITLE" && $line =~ /^$title_pat/) {
            $state = "TITLE";
        }
        if ($state eq "TITLE") {
            if ($line =~ /^$end_title/) {
                $state = "FIND_HEADINGS";
                next LINE;
            }
            $line =~ s/<BR>/ /g;     # replace <BR> with blanks
            $line =~ s/,/ /g;        # replace any commas (we use as delim.) with blanks
            $line =~ s/<[^>]+>//g;   # get rid of HTML commands
            print $line, "\n";
            next LINE;
        }

        #--------------Find and Process Column Headings--------------------
        if ($state eq "FIND_HEADINGS" && $line =~ /^$heading_pat/) {
            $state = "HEADINGS";
            next LINE;
        }
        if ($state eq "HEADINGS") {
            if ($line =~ /^$end_headings/) {
                print join(",", @headings), "\n";
                $state = "COLUMNS";
                next LINE;
            }
            $line =~ s/<BR>/ /g;               # replace <BR> with blanks
            $line =~ s{</TH><TH}{</TH>,<TH}g;  # multiple headings in a line:
                                               # insert comma between heading "commands"
            $line =~ s/<[^>]+>//g;             # get rid of HTML commands
            $line =~ s/^ *//;                  # get rid of leading blanks
            $line =~ s/ *$//;                  # get rid of trailing blanks
            $line =~ s/ *, */,/g;              # blanks around EVERY comma delimiter
            push(@headings, $line);
            next LINE;
        }

        #--------------Find and Process Columns--------------------
        if ($state eq "COLUMNS") {
            # End of table: stop reading, the rest of the file is ignored.
            last LINE if $line =~ /$quit_pat/;

            if ($line =~ /$newcol_pat/) {      # start of the next column
                $col++;
                $row = 0;
                next LINE;
            }

            next LINE if $line =~ /<TR>/;        # ignore this line
            next LINE if $line =~ /$endcol_pat/; # ignore this line

            $line =~ s/<BR>//g;      # replace <BR> with nothing
            $line =~ s/<[^>]+>//g;   # get rid of HTML commands
            $line =~ s/&amp;/&/g;    # replace "&amp;" text with ampersand
            $line =~ s/^ *//;        # get rid of leading blanks
            $line =~ s/ *$//;        # get rid of trailing blanks
            # NOTE(review): commas inside cell text are not escaped here
            # (only the title has them replaced), so such a cell would
            # split into two fields in Excel - confirm whether the data
            # can contain them.

            if ($col > 1) {
                if ($row <= $#rows) {
                    $rows[$row] .= "," . $line;
                }
                else {
                    # This column is longer than the earlier ones: start a
                    # new row, with one empty field for each of the
                    # ($col - 1) columns that precede this one.
                    push(@rows, ("," x ($col - 1)) . $line);
                }
            }
            else {
                push(@rows, $line);
            }
            $row++;
        }
    }
    close($in);

    #--------------Print array by row--------------------
    # One row per line, each (including the last) ended by a newline.
    foreach my $out_row (@rows) {
        print $out_row, "\n";
    }

    return 1;
}

# -- end of ind_perf.pl
Report TOU ViolationShare This Post
 Public ReplyPrvt ReplyMark as Last ReadFilePrevious 10Next 10PreviousNext