#!/usr/bin/perl -w
######################################################################
# arsforum_user_grab -- by greenfly
#
# This program parses a specified Ars Forum index page, then notes
# all accounts which have posted on the last page of each thread
# listed on that index page.
#
# The script outputs the usernames in order of most recent post and
# can optionally print a privoxy-compatible filter matching those
# names.
#
# usage: arsforum_user_grab [forum name]
#
# The forum name must be one of the keys of %fora below; when it is
# omitted, $default is used.
######################################################################

use strict;
use warnings;

use LWP::UserAgent;
use HTTP::Request::Common;
use HTML::Parser;
use Date::Manip qw(ParseDate UnixDate);

my $default = 'Battlefront';

our $wait            = 0;    # seconds to wait before loading the next page (so as not to spam the forum)
our $output_filter   = 0;    # print the privoxy filter
our $output_grab     = 0;    # print the forum scraping feedback
our $output_post_url = 1;    # print the URL of the last post for each user
our $output_html     = 1;    # print output in HTML
our $max_count       = 100;  # number of users to output, 0 means all

# Forum display name => forum id used in the f= URL parameter.
our %fora = (
    'Audio/Visual Club'                  => "67909965",
    'Case and Cooling Fetish'            => "77909585",
    'CPU & Motherboard Technologia'      => "77909774",
    'Mobile Computing Outpost'           => "579009962631",
    'Networking Matrix'                  => "469092836",
    'Other Hardware'                     => "24609792",
    'Agora Classifieds'                  => "57909216",
    'Battlefront'                        => "48409524",
    'Microsoft OS & Software Colloquium' => "99609816",
    'Linux Kung Fu'                      => "96509133",
    'NT, Win2K, & XP Technical Mojo'     => "12009443",
    'Distributed Computing Arcana'       => "122097561",
    'Macintoshian Achaia'                => "8300945231",
    "Programmer's Symposium"             => "6330927813",
    'Gaming, Extra Strength Caplets'     => "39309975",
    'The Lounge'                         => "34709834",
    'The Soap Box'                       => "28609695",
    'The Boardroom'                      => "599009962631",
    'Ars Technica News & Discussion'     => "174096756",
    'The Velvet Room'                    => "8390901411",
    'OpenForum Feedback & Suggestions'   => "51009562",
    'Ars PDF Technical Library'          => "5850957912",
    'Subscription Support & Service'     => "6490940022",
    'Ars Emporium Customer Service'      => "6550932203",
    "The Moderators' Quorum"             => "4680902032",
    "The Writers' Block"                 => "6220903374",
    "Dungeon Masters' back room"         => "103007483631",
);

# Resolve the forum to scrape.  An unknown name used to leave $forum
# undefined and produce a malformed index URL; fail early instead.
my $forum_name = defined $ARGV[0] ? $ARGV[0] : $default;
unless (exists $fora{$forum_name}) {
    die "Unknown forum '$forum_name'. Known forums:\n"
      . join('', map { "  $_\n" } sort keys %fora);
}
our $forum = $fora{$forum_name};
print "Get $forum_name Index\n " if $output_grab;

our $base  = "http://episteme.arstechnica.com/eve/ubb.x?";
our $site  = "50009562";
our $index = $base . "a=frm&s=$site&f=$forum";

our %index;    # thread id => { title, author, lastpage }
our %users;    # user name => { date, url, title } of that user's newest post seen
my @userlist;
my $url;

# grab the Forum's index page
my $page = grab_page($index)
    or die "\nCan't open $index, nothing to parse!\n";

# then parse it
parse_index($page);

# parse the last page of each thread in the index for user names
for my $thread_id (keys %index) {
    print "\"$index{$thread_id}{'title'}\" - Started by: $index{$thread_id}{'author'} - Page $index{$thread_id}{'lastpage'}\n "
        if $output_grab;

    $url  = $base . "a=tpc&s=$site&f=$forum&m=$thread_id&p=$index{$thread_id}{'lastpage'}";
    $page = grab_page($url);
    if ($page) {
        parse_topic($url, $page, $index{$thread_id}{'title'});
    }
    else {
        # A failed fetch returns 0; skip the thread rather than parse it.
        warn "Can't open $url, skipping thread\n";
    }
    sleep $wait;
}

# at this point all of the usernames have been gathered.
# Now output them sorted by date
my $count = 1;
for my $name (sort by_date keys %users) {
    last if $max_count > 0 && $count > $max_count;

    if ($output_html) {
        # NOTE(review): this line ends in two newlines and carries no HTML
        # markup; any tags it once contained are not present in this copy
        # of the file -- confirm against a known-good version.
        print "$count: $users{$name}{date}\t$name\t--\t$users{$name}{title}\n\n";
    }
    else {
        print "$count: $users{$name}{date}\t$name";
        if ($output_post_url) {
            print "\t$users{$name}{url}\n";
        }
        else {
            print "\n";
        }
    }
    push @userlist, $name;
    $count++;
}

if ($output_filter) {
    # Now output the privoxy filter
    # NOTE(review): the pattern text below looks like it has lost its HTML
    # tag literals; it is reproduced unchanged -- confirm before relying
    # on the generated filter.
    print "-" x 20, "start default.filter", "-" x 20, "\n";
    print 's/]+>[^<]+[^<]+]+>]+><\/a>]+><[^>]+>(';
    # Escape regex metacharacters (and the "/" delimiter) in user names so
    # a name cannot alter or break the generated expression.
    print join "|", map { quotemeta } @userlist;
    print ')<\/a><\/span>/' . "\n" . '/gsU' . "\n";
    print "-" x 20, "end default.filter", "-" x 23, "\n";
}

#==================================================
# subroutines
#==================================================

# Sort comparator over user names ($a / $b): newest post first.
# The stored date is split on ":" and the first two fields are compared
# numerically; ties fall back to reverse string order of the names.
# Missing fields (empty date) compare as 0.
sub by_date {
    my ($datea, $mina) = split /:/, $users{$a}{date};
    my ($dateb, $minb) = split /:/, $users{$b}{date};

    return ($dateb || 0) <=> ($datea || 0)
        || ($minb  || 0) <=> ($mina  || 0)
        || $b cmp $a;
}

# Fetch $url with an HTTP GET.
# Returns the response content on success, 0 on failure.
# Prints progress when $output_grab is set.
sub grab_page {
    my $url = shift;

    $| = 1;    # unbuffered STDOUT so progress appears immediately
    print "Grabbing $url..." if $output_grab;

    my $ua  = LWP::UserAgent->new;
    my $res = $ua->request(GET($url));

    unless ($res->is_success) {
        print "failed (", $res->status_line, ")\n" if $output_grab;
        return 0;
    }

    print "done\n" if $output_grab;
    return $res->content;
}

# Scan a forum index page and record every thread found in %index,
# keyed by thread id, with its title, author and last page number
# (1 when no page number is found).
sub parse_index {
    my $page = shift;
    my ($message, $title, $lastpage, $author);

    # NOTE(review): the code reads $1..$4 but this pattern contains only
    # three capture groups, so $author is always undef as written.  The
    # HTML tag literals appear to be missing from the pattern; it is kept
    # byte-for-byte -- restore from a known-good copy.
    while ($page =~ /]*>([^<]*)<\/a>(.*?)([^<]*)<\/span>/igms) {
        ($message, $title, $lastpage, $author) = ($1, $2, $3, $4);

        # reduce the captured link text to the numeric thread id
        $message =~ s/.*m=(\d+)"/$1/;

        if ($lastpage =~ /.*p=(\d+)"/) {
            $lastpage = $1;
        }
        else {
            $lastpage = 1;
        }

        $index{$message}{"title"}    = $title;
        $index{$message}{"author"}   = $author;
        $index{$message}{"lastpage"} = $lastpage;
    }
}

# Scan one thread page and remember, per user, the newest post seen.
#   $url   - URL the page was fetched from (not used by the body)
#   $page  - page content
#   $title - thread title to record with the post
# Each user gets an own hash in %users with keys date, url and title, so
# users sharing a timestamp no longer overwrite each other's data.
sub parse_topic {
    my ($url, $page, $title) = @_;
    my ($user, $post_date, $post_url);

    # NOTE(review): the code reads $1..$3 but this pattern contains only
    # two capture groups, so $3 (the date) is undef as written.  The HTML
    # tag literals appear to be missing from the pattern; it is kept
    # byte-for-byte -- restore from a known-good copy.
    while ($page =~ /]*>([^<]*)<\/a>.*?.*?posted.*?]*?>(\w+ \d+, \d+ \d\d:\d\d)<\/noscript>/igms) {
        $user      = $1;
        $post_url  = $2;
        $post_date = $3;

        $post_date = ParseDate($post_date);
        # Normalise an unparsable date so the comparisons below and in
        # by_date never operate on undef.
        $post_date = '' unless defined $post_date;

        # keep the entry unless an already-stored post is strictly newer
        unless (exists $users{$user} && $users{$user}{date} gt $post_date) {
            $users{$user} = {
                date  => $post_date,
                url   => $post_url,
                title => $title,
            };
        }
    }
}