#!/usr/bin/perl -w
# Parse the table rows and cells out of an HTML file and print them,
# one row per output line, with the cells of a row separated by tabs.
#
# Usage: perl <this script> FILE

use strict;
use warnings;

use HTML::Parser;

our $rowcount = -1;    # index of the current row in @table (-1: no row yet)
our $colcount = 0;     # number of cells started so far in the current row
our @table;            # array of arrays: $table[$row][$col] is a cell's text
our @text;             # declared by the original script; not used below

my $file = shift @ARGV;
die "Usage: $0 FILE\n" unless defined $file && length $file;

# Set up our HTML parser.  Only the table-structure tags are reported to
# the start/end handlers; "table" is included so that a closing </table>
# also stops text capture when </td> or </th> end tags were omitted.
my $p = HTML::Parser->new(
    api_version => 3,
    start_h     => [ \&t_start_handler, "self,tagname,attr" ],
    end_h       => [ \&t_end_handler,   "self,tagname" ],
    report_tags => [qw(table tr td th)],
);

$p->parse_file($file) or die "Cannot parse '$file': $!\n";

# At this point the @table array of arrays should be populated.
foreach my $row (0 .. $#table) {
    print "row $row:";
    foreach my $col (@{ $table[$row] }) {
        print "\t", _clean_cell($col);
    }
    print "\n";
}

############################################################
# subroutines start here
############################################################

# Start-tag handler for <table>, <tr>, <td> and <th>.
#   $self - the HTML::Parser object
#   $tag  - lower-cased tag name
#   $attr - attribute hash reference (unused)
# A <tr> opens a new row; a <td>/<th> opens a new (initially empty) cell
# in the current row and installs hash_text() to collect its text.
sub t_start_handler {
    my ($self, $tag, $attr) = @_;

    # Any reported start tag ends the previous cell, even when that
    # cell's end tag was left out of the document.
    $self->handler(text => undef);

    if ($tag eq 'tr') {
        _start_row();
    }
    elsif ($tag eq 'td' || $tag eq 'th') {
        # A cell with no enclosing <tr> would otherwise index row -1.
        _start_row() if $rowcount < 0;

        # Reserve the cell up front so that empty cells keep their
        # column position instead of shifting later cells left.
        $table[$rowcount][$colcount] = '';
        $colcount++;

        $self->handler(text => \&hash_text, "dtext");
    }
    return;
}

# Text handler, active only while inside a cell.
#   $text - entity-decoded text chunk ("dtext")
# Appends the chunk to the current cell.  A cell's text can arrive in
# several chunks, so whitespace cleanup is deferred to _clean_cell().
sub hash_text {
    my $text = shift;
    return unless defined $text;
    return unless $rowcount >= 0 && $colcount > 0;

    $table[$rowcount][ $colcount - 1 ] .= $text;
    return;
}

# End-tag handler for </table>, </tr>, </td> and </th>.
#   $self - the HTML::Parser object
#   $tag  - lower-cased tag name (unused)
# Stops text capture.  The start and end handlers stay registered so
# that every following cell is handled the same way.
sub t_end_handler {
    my ($self, $tag) = @_;
    $self->handler(text => undef);
    return;
}

# Begin a new, empty row in @table and reset the column counter.
sub _start_row {
    $rowcount++;
    $colcount = 0;
    $table[$rowcount] = [];
    return;
}

# Return $text with every run of whitespace collapsed to a single space
# and leading/trailing whitespace removed; undef becomes ''.
sub _clean_cell {
    my ($text) = @_;
    return '' unless defined $text;

    $text =~ s/\s+/ /g;
    $text =~ s/\A //;
    $text =~ s/ \z//;
    return $text;
}