# This is a shell archive. Remove anything before this line, # then unpack it by saving it in a file and typing "sh file". # # Wrapped by Ti Lian Hwang <tilh@mars> on Thu Jul 6 15:14:10 1995 # # This archive contains: # dummy.pl h2a_pcl.pl parse-html.pl html-ascii.pl # LANG=""; export LANG PATH=/bin:/usr/bin:$PATH; export PATH echo x - dummy.pl cat >dummy.pl <<'@EOF' # put your drivers in a file # 'h2a_$TERM.pl' where $TERM is the environment variable # sub html_begin_doc {} sub html_end_doc {} sub begin_font {} sub end_font {} 1; @EOF chmod 664 dummy.pl echo x - h2a_pcl.pl cat >h2a_pcl.pl <<'@EOF' # # HP-PCL font functions to be used by html-to-ascii.pl # "required" by parse-html" # # Author : Ti Lian Hwang # email : tilh@sin-ro.sin-ro.dhl.com # # # fixed spacing, 12 pitch, 12 points, Times Roman $normal_font = "\033(s0p12h12v3T"; sub html_begin_doc { # reset, CR=CR,LF=LF+CR,FF=FF+CR print "\033E\033&k2G"; } sub html_end_doc { print "\033E"; } $Begin{"B"} = "begin_font"; $End{"B"} = "end_font"; $Begin{"I"} = "begin_font"; $End{"I"} = "end_font"; $Begin{"U"} = "begin_font"; $End{"U"} = "end_font"; $Begin{"EM"} = "begin_font"; $End{"EM"} = "end_font"; %font_begin = ( "PRE","\033(s0p12h3T", "H1","\033(s1p20v5t3b", "H2","\033(s1p18v5t3b", "H3","\033(s1p16v5t3B", "H4","\033(s1p14v5t3B", "H5","\033(s1p12v5t3B", "H6","\033(s1p12v5t3B", "B","\033(s3B", "U","\033&dD", "I","\033(s1S", "EM","\033(s3B" ); %font_end = ( "PRE",$normal_font, "H1",$normal_font . "\033(s0B", "H2",$normal_font . "\033(s0B", "H3",$normal_font . "\033(s0B", "H4",$normal_font . "\033(s0B", "H5",$normal_font . "\033(s0B", "H6",$normal_font . "\033(s0B", "B","\033(s0B", "U","\033&d@", "I","\033(s0S", "EM","\033(s0B" ); sub begin_font { local ($element, $tag) = @_; print $font_begin{$element}; } sub end_font { local ($element, $tag) = @_; print $font_end{$element}; } 1; @EOF chmod 664 h2a_pcl.pl echo x - parse-html.pl sed 's/^@//' >parse-html.pl <<'@EOF' # HTML parser # Jim Davis, July 15 1994 # This is an HTML parser not an SGML parser. It does not parse a DTD, # The DTD is implicit in the code, and specific to HTML. # The processing of the HTML can be customized by the user by # 1) Defining routines to be called for various tags (see Begin and End arrays) # 2) Defining routines html_content and html_whitespace # This is not a validating parser. It does not check the content model # eg you can use DT outside a DL and it won't know. It is too liberal in # what tags are allowed to minimize what other tags. # Bugs - can't parse the prolog or whatever you call it # <!DOCTYPE HTML [ # <!entity % HTML.Minimal "INCLUDE"> # <!-- Include standard HTML DTD --> # <!ENTITY % html PUBLIC "-//connolly hal.com//DTD WWW HTML 1.8//EN"> # %html; # ]> # modified 3 Aug to add a bunch of HTML 2.0 tags # modified 3 Sept to print HTML stack to STDERR not STDOUT, to add new # routines html_begin_doc and html_end_doc for application specific cleanup # and to break parse_html into two pieces. # modified 30 Sept 94. parse_attributes now handles tag attributes that # don't have values. thanks to Bill Simpson-Young <bill@syd.dit.csiro.au> # for the code. # modified 17 Apr 95 to support FORMS tags. # # modified 6 July 1995 by Ti Lian Hwang <tilh@sin-co.sin-ro.dhl.com> # to handle 'printer drivers' - files to be 'require' depending on $TERM # files are to have name of 'h2a_$TERM.pl' # $filename = "h2a_" . $ENV{"TERM"} . ".pl"; foreach $prefix (@INC) { $realfilename = "$prefix/$filename"; if (-f $realfilename) { require "$filename" ; $filename = ""; last; } } if ($filename) {require "dummy.pl" ; } $debug = 0; $whitespace_significant = 0; # global variables: # $line_buffer is line buffer # $line_count is input line number. $line_buffer = ""; $line_count = 0; sub parse_html { local ($file) = @_; open (HTML, $file) || die "Could not open $file: $!\nStopped"; &parse_html_stream (); close (HTML);} # Global input HTML is the handle to the stream of HTML sub parse_html_stream { local ($token, $new); ## initialization @stack=(); $line_count = 0; $line_buffer = ""; ## application specific initialization &html_begin_doc(); main: while (1) { # if whitespace does not matter, trim any leading space. if (! $whitespace_significant) { $line_buffer =~ s/^\s+//;} # now dispatch on the type of token if ($line_buffer =~ /^(\s+)/) { $token = $1; $line_buffer = $'; &html_whitespace ($token);} # This will lose if there is more than one comment on the line! elsif ($line_buffer =~ /^(\<!--.*-->)/) { $token = $1; $line_buffer = $'; &html_comment ($token);} elsif ($line_buffer =~ /^(\<![^-][^\>]*\>)/) { $token = $1; $line_buffer = $'; &html_comment ($token);} elsif ($line_buffer =~ /^(\<\/[^\>]*\>)/) { $token = $1; $line_buffer = $'; &html_etag ($token);} elsif ($line_buffer =~ /^(\<[^!\/][^\>]*\>)/) { $token = $1; $line_buffer = $'; &html_tag ($token);} elsif ($line_buffer =~ /^([^\s<]+)/) { $token = $1; $line_buffer = $'; $token = &substitute_entities($token); &html_content ($token); } else { # No valid token in buffer. Maybe it's empty, or maybe there's an # incomplete tag. So get some more data. $new = <HTML>; if (! defined ($new)) {last main;} # if we're trying to find a match for a tag, then get rid of embedded newline # this is, I think, a kludge if ($line_buffer =~ /^\</ && $line_buffer =~ /\n$/) { chop $line_buffer; $line_buffer .= " ";} $line_buffer .= $new; $line_count++;} } ## cleanup &html_end_doc(); if ($#stack > -1) { print STDERR "Stack not empty at end of document\n"; &print_html_stack();} } sub html_tag { local ($tag) = @_; local ($element) = &tag_element ($tag); local (%attributes) = &tag_attributes ($tag); # the tag might minimize (be an implicit end) for the previous tag local ($prev_element); while (&Minimizes(&stack_top_element(), $element)) { $prev_element = &stack_pop_element (); if ($debug) { print STDERR "MINIMIZING $prev_element with $element on $line_count\n";} &html_end ($prev_element, 0);} push (@stack, $tag); &html_begin ($element, $tag, *attributes); if (&Empty($element)) { pop(@stack); &html_end ($element, 0);} } sub html_etag { local ($tag) = @_; local ($element) = &tag_element ($tag); # pop stack until find matching tag. This is probably a bad idea, # or at least too general. local ( $prev_element) = &stack_pop_element(); until ($prev_element eq $element) { if ($debug) { print STDERR "MINIMIZING $prev_element with /$element on $line_count \n";} &html_end ($prev_element, 0); if ($#stack == -1) { print STDERR "No match found for /$element. You will lose\n"; last;} $prev_element = &stack_pop_element();} &html_end ($element, 1); } # For each element, the names of elements which minimize it. # This is of course totally HTML dependent and probably I have it wrong too $Minimize{"DT"} = "DT:DD"; $Minimize{"DD"} = "DT"; $Minimize{"LI"} = "LI"; $Minimize{"P"} = "P:DT:LI:H1:H2:H3:H4:BLOCKQUOTE:UL:OL:DL"; # Does element E2 minimize E1? sub Minimizes { local ($e1, $e2) = @_; local ($value) = 0; foreach $elt (split (":", $Minimize{$e1})) { if ($elt eq $e2) {$value = 1;}} $value;} $Empty{"BASE"} = 1; $Empty{"BR"} = 1; $Empty{"HR"} = 1; $Empty{"IMG"} = 1; $Empty{"ISINDEX"} = 1; $Empty{"LINK"} = 1; $Empty{"META"} = 1; $Empty{"NEXTID"} = 1; $Empty{"INPUT"} = 1; # Empty tags have no content and hence no end tags sub Empty { local ($element) = @_; $Empty{$element};} sub print_html_stack { print STDERR "\n ==\n"; foreach $elt (reverse @stack) {print STDERR " $elt\n";} print STDERR " ==========\n";} # The element on top of stack, if any. sub stack_top_element { if ($#stack >= 0) { &tag_element ($stack[$#stack]);}} sub stack_pop_element { &tag_element (pop (@stack));} # The element from the tag, normalized. sub tag_element { local ($tag) = @_; $tag =~ /<\/?([^\s>]+)/; local ($element) = $1; $element =~ tr/a-z/A-Z/; $element;} # associative array of the attributes of a tag. sub tag_attributes { local ($tag) = @_; $tag =~ /^<[A-Za-z]+ +(.*)>$/; &parse_attributes($1);} # string should be something like # KEY="value" KEY2="longer value" KEY3="tags o doom" # output is an associative array (like a lisp property list) # attributes names are not case sensitive, do I downcase them # Maybe (probably) I should substitute for entities when parsing attributes. sub parse_attributes { local ($string) = @_; local (%attributes); local ($name, $val); get: while (1) { if ($string =~ /^ *([A-Za-z]+)=\"([^\"]*)\"/) { $name = $1; $val = $2; $string = $'; $name =~ tr/A-Z/a-z/; $attributes{$name} = $val; } elsif ($string =~ /^ *([A-Za-z]+)=(\S*)/) { $name = $1; $val = $2; $string = $'; $name =~ tr/A-Z/a-z/; $attributes{$name} = $val;} elsif ($string =~ /^ *([A-Za-z]+)/) { $name = $1; $val = ""; $string = $'; $name =~ tr/A-Z/a-z/; $attributes{$name} = $val;} else {last;}} %attributes;} sub substitute_entities { local ($string) = @_; $string =~ s/&/&/g; $string =~ s/</</g; $string =~ s/>/>/g; $string =~ s/"/\"/g; $string;} @@HTML_elements = ( "A", "ADDRESS", "B", "BASE", "BLINK", # Netscape addition :-( "BLOCKQUOTE", "BODY", "BR", "CITE", "CENTER", # Netscape addition :-( "CODE", "DD", "DIR", "DFN", "DL", "DT", "EM", "FORM", "H1", "H2", "H3", "H4", "H5", "H6", "HEAD", "HR", "HTML", "I", "ISINDEX", "IMG", "INPUT", "KBD", "LI", "LINK", "MENU", "META", "NEXTID", "OL", "OPTION", "P", "PRE", "SAMP", "SELECT", "STRIKE", "STRONG", "TITLE", "TEXTAREA", "TT", "U", "UL", "VAR", ); sub define_element { local ($element) = @_; $Begin{$element} = "Noop"; $End{$element} = "Noop";} foreach $element (@HTML_elements) {&define_element($element);} # do nothing sub Noop { local ($element, $xxx) = @_;} # called when a tag begins. Dispatches using Begin sub html_begin { local ($element, $tag, *attributes) = @_; local ($routine) = $Begin{$element}; if ($routine eq "") { print STDERR "Unknown HTML element $element ($tag) on line $line_count\n";} else { &begin_font ($element, $explicit); eval "&$routine;"} } # called when a tag ends. Explicit is 0 if tag end is because of minimization # not that you should care. sub html_end { local ($element, $explicit) = @_; local ($routine) = $End{$element}; if ($routine eq "") { print STDERR "Unknown HTML element \"$element\" (END $explicit) on line $line_count\n";} else { eval "&$routine(\"$element\", $explicit)"; &end_font ($element, $explicit); } } sub html_content { local ($word) = @_; } sub html_whitespace { local ($whitespace) = @_;} sub html_comment { local ($tag) = @_;} # redefine these for application-specific initialization and cleanup # sub html_begin_doc {} # sub html_end_doc {} # return a "true value" when loaded by perl. 1; @EOF chmod 644 parse-html.pl echo x - html-ascii.pl sed 's/^@//' >html-ascii.pl <<'@EOF' # Routines for HTML to ASCII. # (fixed width font, no font changes for size, bold, etc) with a little # BUGS AND MISSING FEATURES # font tags (e.g. CODE, EM) cause an extra whitespace # e.g. <TT>foo</TT>, -> foo , # Jim Davis July 15 1994 # modified 3 Aug 94 to support MENU and DIR require "tformat.pl" || die "Could not load tformat.pl: $@\nStopped"; # Can be set by command line arg if (! defined($columns_per_line)) { $columns_per_line = 72;} if (! defined($flush_last_page)) { $flush_last_page = 1;} # amount to add to indentation $indent_left = 5; $indent_right = 5; # ignore contents inside HEAD. $ignore_text = 0; # Set variables in tformat $left_margin = 1; $right_margin = $columns_per_line; $bottom_margin = 0; ## Routines called by html.pl $Begin{"HEAD"} = "begin_head"; $End{"HEAD"} = "end_head"; sub begin_head { local ($element, $tag) = @_; $ignore_text = 1;} sub end_head { local ($element) = @_; $ignore_text = 0;} $Begin{"BODY"} = "begin_document"; sub begin_document { local ($element, $tag) = @_; &start_page();} $End{"BODY"} = "end_document"; sub end_document { local ($element) = @_; &fresh_line();} ## Headers $Begin{"H1"} = "begin_header"; $End{"H1"} = "end_header"; $Begin{"H2"} = "begin_header"; $End{"H2"} = "end_header"; $Begin{"H3"} = "begin_header"; $End{"H3"} = "end_header"; $Begin{"H4"} = "begin_header"; $End{"H4"} = "end_header"; $Skip_Before{"H1"} = 1; $Skip_After{"H1"} = 1; $Skip_Before{"H2"} = 1; $Skip_After{"H2"} = 1; $Skip_Before{"H3"} = 1; $Skip_After{"H3"} = 0; sub begin_header { local ($element, $tag) = @_; &skip_n_lines ($Skip_Before{$element}, 5);} sub end_header { local ($element) = @_; &skip_n_lines ($Skip_After{$element});} $Begin{"BR"} = "line_break"; sub line_break { local ($element, $tag) = @_; &fresh_line();} $Begin{"P"} = "begin_paragraph"; # if fewer than this many lines left on page, start new page $widow_cutoff = 5; sub begin_paragraph { local ($element, $tag) = @_; &skip_n_lines (1, $widow_cutoff);} $Begin{"BLOCKQUOTE"} = "begin_blockquote"; $End{"BLOCKQUOTE"} = "end_blockquote"; sub begin_blockquote { local ($element, $tag) = @_; $left_margin += $indent_left; $right_margin = $columns_per_line - $indent_right; &skip_n_lines (1);} sub end_blockquote { local ($element) = @_; $left_margin -= $indent_left; $right_margin = $columns_per_line; &skip_n_lines (1);} $Begin{"PRE"} = "begin_pre"; $End{"PRE"} = "end_pre"; sub begin_pre { local ($element, $tag) = @_; $whitespace_significant = 1;} sub end_pre { local ($element) = @_; $whitespace_significant = 0;} $Begin{"INPUT"} = "form_input"; sub form_input { local ($element, $tag, *attributes) = @_; if ($attributes{"value"} ne "") { &print_word_wrap($attributes{"value"});}} $Begin{"HR"} = "horizontal_rule"; sub horizontal_rule { local ($element, $tag) = @_; &fresh_line (); &print_n_chars ($right_margin - $left_margin, "-");} # Add code for IMG (use ALT attribute) # Ignore I, B, EM, TT, CODE (no font changes) ## List environments $Begin{"UL"} = "begin_itemize"; $End{"UL"} = "end_list_env"; $Begin{"OL"} = "begin_enumerated"; $End{"OL"} = "end_list_env"; $Begin{"MENU"} = "begin_menu"; $End{"MENU"} = "end_list_env"; $Begin{"DIR"} = "begin_dir"; $End{"DIR"} = "end_list_env"; $Begin{"LI"} = "begin_list_item"; @@list_stack = (); $list_type = "bullet"; $list_counter = 0; sub push_list_env { push (@list_stack, join (":", $list_type, $list_counter));} sub pop_list_env { ($list_type, $list_counter) = split (":", pop (@list_stack)); $left_margin -= $indent_left;} sub begin_itemize { local ($element, $tag) = @_; &push_list_env(); $left_margin += $indent_left; $list_type = "bullet"; $list_counter = "*";} sub begin_menu { local ($element, $tag) = @_; &push_list_env(); $left_margin += $indent_left; $list_type = "bullet"; $list_counter = "*";} sub begin_dir { local ($element, $tag) = @_; &push_list_env(); $left_margin += $indent_left; $list_type = "bullet"; $list_counter = "*";} sub begin_enumerated { local ($element, $tag) = @_; &push_list_env(); $left_margin += $indent_left; $list_type = "enumerated"; $list_counter = 1;} sub end_list_env { local ($element) = @_; &pop_list_env(); # &fresh_line(); } sub begin_list_item { local ($element, $tag) = @_; $left_margin -= 2; &fresh_line(); &print_word_wrap("$list_counter "); if ($list_type eq "enumerated") {$list_counter++;} $left_margin += 2;} $Begin{"DL"} = "begin_dl"; sub begin_dl { local ($element, $tag) = @_; &skip_n_lines(1,5);} $Begin{"DT"} = "begin_defined_term"; $Begin{"DD"} = "begin_defined_definition"; $End{"DD"} = "end_defined_definition"; sub begin_defined_term { local ($element, $tag) = @_; &fresh_line();} sub begin_defined_definition { local ($element, $tag) = @_; $left_margin += $indent_left; &fresh_line();} sub end_defined_definition { local ($element) = @_; $left_margin -= $indent_left; &fresh_line();} $Begin{"META"} = "begin_meta"; # a META tag sets a value in the assoc array %Variable # i.e. <META name="author" content="Rushdie"> sers $Variable{author} to "Rushdie" sub begin_meta { local ($element, $tag, *attributes) = @_; local ($variable, $value); $variable = $attributes{name}; $value = $attributes{content}; $Variable{$variable} = $value;} $Begin{"IMG"} = "begin_img"; sub begin_img { local ($element, $tag, *attributes) = @_; &print_word_wrap (($attributes{"alt"} ne "") ? $attributes{"alt"} : "[IMAGE]");} # Content and whitespace. sub html_content { local ($string) = @_; unless ($ignore_text) { &print_word_wrap ($string);}} sub html_whitespace { local ($string) = @_; if (! $whitespace_significant) { die "Internal error, called html_whitespace when whitespace was not significant";} local ($i); for ($i = 0; $i < length ($string); $i++) { &print_whitespace (substr($string,$i,1));}} # called by tformat. Do nothing. sub do_footer { } sub do_header { } 1; @EOF chmod 644 html-ascii.pl exit 0