#!/usr/bin/perl -w

######################################################################
# arsforum_user_grab -- by greenfly <greenfly@greenfly.org>
#
# This program will parse a specified Ars Forum Index page,
# then note all accounts which have posted in the most 
# recent threads on that index page.
#
# The script will output the usernames in order of most 
# recent post, and then print a filter that is compatible 
# with privoxy to filter those names
#
# usage: arsforum_user_grab [<forum name>]
# 
######################################################################

use LWP::UserAgent;
use HTTP::Request::Common;
use HTML::Parser;
use Date::Manip qw(ParseDate UnixDate);

# ---- settings --------------------------------------------------------

# Forum scanned when no forum name is given on the command line.
my $default = 'Battlefront';

# Seconds to pause after each thread page is loaded
# (so as not to spam the forum).
our $wait = 0;

# Output switches (1 = on, 0 = off).
our $output_filter   = 0;    # print the privoxy filter
our $output_grab     = 0;    # print the forum scraping feedback
our $output_post_url = 1;    # print the URL of the last post for each user
our $output_html     = 1;    # print output in HTML

# Number of users to output; 0 means all.
our $max_count = 100;

# Forum name => forum id, as substituted into the "f=" URL parameter.
# The ids are kept as strings.
our %fora = (
    'Audio/Visual Club'                  => '67909965',
    'Case and Cooling Fetish'            => '77909585',
    'CPU & Motherboard Technologia'      => '77909774',
    'Mobile Computing Outpost'           => '579009962631',
    'Networking Matrix'                  => '469092836',
    'Other Hardware'                     => '24609792',
    'Agora Classifieds'                  => '57909216',
    'Battlefront'                        => '48409524',
    'Microsoft OS & Software Colloquium' => '99609816',
    'Linux Kung Fu'                      => '96509133',
    'NT, Win2K, & XP Technical Mojo'     => '12009443',
    'Distributed Computing Arcana'       => '122097561',
    'Macintoshian Achaia'                => '8300945231',
    "Programmer's Symposium"             => '6330927813',
    'Gaming, Extra Strength Caplets'     => '39309975',
    'The Lounge'                         => '34709834',
    'The Soap Box'                       => '28609695',
    'The Boardroom'                      => '599009962631',
    'Ars Technica News & Discussion'     => '174096756',
    'The Velvet Room'                    => '8390901411',
    'OpenForum Feedback & Suggestions'   => '51009562',
    'Ars PDF Technical Library'          => '5850957912',
    'Subscription Support & Service'     => '6490940022',
    'Ars Emporium Customer Service'      => '6550932203',
    "The Moderators' Quorum"             => '4680902032',
    "The Writers' Block"                 => '6220903374',
    "Dungeon Masters' back room"         => '103007483631',
);

# Pick the forum to scan: the first command-line argument when one is
# given, otherwise $default.  The name has to be a key of %fora; an
# unknown name is rejected here instead of building an index URL with an
# undefined forum id.
my $forum_name = defined $ARGV[0] ? $ARGV[0] : $default;
our $forum = $fora{$forum_name};
unless(defined $forum)
{
   die "Unknown forum \"$forum_name\"\n"
     . "usage: $0 [<forum name>]\n"
     . "Known forums:\n"
     . join('', map { "   $_\n" } sort keys %fora);
}
print "Get $forum_name Index\n   " if($output_grab);


# URL pieces: every request goes to $base with a query string that names
# the action (a=frm for the forum index, a=tpc for a thread page), the
# site id and the forum id.
our $base = "http://episteme.arstechnica.com/eve/ubb.x?";
our $site = "50009562";
our $index = $base . "a=frm&s=$site&f=$forum";

our %index;     # message id => { title, author, lastpage }, filled by parse_index
our %users;     # user name => last-post record, filled by parse_topic
my @userlist;   # user names in output order, filled while printing
my $url;


# grab the Forum's index page
my $page = grab_page($index) or die "\nCan't open $index, nothing to parse!\n";

# then parse it
parse_index($page);

# parse the last page of each thread in the index for user names
foreach my $message (keys %index)
{
   my $thread = $index{$message};

   print "\"$thread->{'title'}\" - Started by: $thread->{'author'} - Page $thread->{'lastpage'}\n   " if($output_grab);
   $url = $base . "a=tpc&s=$site&f=$forum&m=$message&p=$thread->{'lastpage'}";
   $page = grab_page($url);

   if($page)
   {
      parse_topic($url, $page, $thread->{'title'});
   }
   else
   {
      # grab_page returns a false value on failure; there is nothing to
      # parse, so report it and carry on with the remaining threads.
      warn "\nCan't open $url, skipping this thread\n";
   }

   sleep $wait;
}


# At this point all of the usernames have been gathered.
# Print them in by_date order (most recent post first), numbered from 1,
# stopping after $max_count entries when $max_count is non-zero.  Every
# printed name is also appended to @userlist for the filter output.
my $count = 1;
foreach my $name (sort by_date keys %users)
{
   last if(($max_count > 0) && ($count > $max_count));

   # "$users{$name}" interpolates as the date of the user's last post;
   # the 'url' and 'title' entries belong to that same post.
   if($output_html)
   {
      print "$count: $users{$name}\t$name\t--\t<a href=\"$users{$name}{'url'}\">$users{$name}{'title'}</a><br/>\n";
   }
   elsif($output_post_url)
   {
      print "$count: $users{$name}\t$name\t$users{$name}{'url'}\n";
   }
   else
   {
      print "$count: $users{$name}\t$name\n";
   }

   push(@userlist, $name);
   $count++;
}

if($output_filter)
{
   # Now output the privoxy filter: one s/// rule whose alternation lists
   # every user name that was printed above.
   #
   # The names are scraped text, so each one is escaped with quotemeta
   # before it goes into the pattern; an unescaped '/', '|', '(' or '.'
   # in a name would otherwise change or break the generated rule.
   my $names = join "|", map { quotemeta($_) } @userlist;

   print "-" x 20, "start default.filter", "-" x 20, "\n";

   print 's/<table class."ev_msg_rowcolor." [^>]+>[^<]+<tr>[^<]+<td[^>]+><a name[^>]+><\/a><span[^>]+><[^>]+>(',
         $names,
         ')<\/a><\/span>/<table style="display:none;"><tr><td>/gsU' . "\n";

   print "-" x 20, "end default.filter", "-" x 23, "\n";
}



#==================================================
# subroutines
#==================================================


# Sort comparator for user names (keys of %users), newest post first.
#
# The value stored for a user is split on ':'; the first field and then
# the second field are compared numerically, larger values sorting
# earlier.  Users that tie on both fields are ordered by name, descending.
sub by_date
{
   my @left  = split /:/, $users{$a};
   my @right = split /:/, $users{$b};

   return ($right[0] <=> $left[0])
       || ($right[1] <=> $left[1])
       || ($b cmp $a);
}


# Fetch one URL with an HTTP GET.
#
#   $url - the address to load
#
# Returns the response body on success and 0 on failure, so callers can
# test the result for truth.  When $output_grab is set, progress is
# printed to STDOUT: "Grabbing <url>..." followed by "done" or by
# "failed" with the HTTP status line.
sub grab_page
{
   my $url = shift;

   # Turn on autoflush so the "Grabbing ..." text shows up while the
   # request is still running (the setting stays on afterwards).
   $| = 1;
   print "Grabbing $url..." if($output_grab);

   my $ua  = LWP::UserAgent->new;
   my $res = $ua->get($url);

   unless($res->is_success)
   {
      # Finish the progress line and say why the fetch failed.
      print "failed (", $res->status_line, ")\n" if($output_grab);
      return 0;
   }

   print "done\n" if($output_grab);
   return $res->content;
}

# Extract the thread list from a forum index page.
#
#   $page - HTML of the index page
#
# For every topic row that matches, an entry is stored in the global
# %index, keyed by the message id taken from the topic link:
#
#   $index{<message id>}{'title'}    - text of the topic link
#   $index{<message id>}{'author'}   - text of the ev_forum_td_misc cell
#   $index{<message id>}{'lastpage'} - page number from a "p=<n>" link found
#                                      between the title and the author
#                                      cell, or 1 when there is none
#
# Rows whose topic link carries no "m=<digits>" are skipped, so only
# numeric message ids ever become keys of %index.
sub parse_index
{
   my $page = shift;

   while($page =~ /<td class="ev_forumlist_td_topicicon".*?<td.*?class="ev_forum_td_title".*?<a href="([^"]*")[^>]*>([^<]*)<\/a>(.*?)<td.*?class="ev_forum_td_misc"><span class="ev_text_normal">([^<]*)<\/span>/igms)
   {
      my ($href, $title, $page_links, $author) = ($1, $2, $3, $4);

      # $href still ends with the closing quote of the attribute, which
      # anchors the id to the end of the link.
      next unless($href =~ /m=(\d+)"/);
      my $message = $1;

      my $lastpage = 1;
      if($page_links =~ /.*p=(\d+)"/){ $lastpage = $1; }

      $index{$message}{'title'}    = $title;
      $index{$message}{'author'}   = $author;
      $index{$message}{'lastpage'} = $lastpage;
   }
}

# Record kept in %users for each user's most recent post.
#
# It is a hash with 'date', 'url' and 'title' entries that turns into its
# 'date' whenever it is used as a string.  Code that treats $users{$name}
# as the date string (printing it, split, gt/cmp) therefore keeps working,
# while $users{$name}{'url'} and $users{$name}{'title'} are stored per
# user.  Storing the bare date string and then subscripting it would make
# every user with the same timestamp share one url/title.
{
   package ArsForumUserGrab::LastPost;
   use overload '""' => sub { $_[0]{'date'} }, fallback => 1;
}

# Collect the posters found on one thread page.
#
#   $url   - address the page was loaded from (not used here)
#   $page  - HTML of the thread page
#   $title - thread title, remembered with each post that is recorded
#
# For every post that matches, the date is run through ParseDate and the
# global %users entry for that user is replaced unless the date already
# stored for the user is strictly later.
sub parse_topic
{
   my ($url, $page, $title) = @_;

   while($page =~ /<td.*?class="ev_msg_userinfo".*?<a class="ev_text_normal"[^>]*>([^<]*)<\/a>.*?<td align=\"right\" valign=\"top\"><a href=\"(.*?)\">.*?posted.*?<noscript[^>]*?>(\w+ \d+, \d+ \d\d:\d\d)<\/noscript>/igms)
   {
      my ($user, $post_url, $posted) = ($1, $2, $3);
      my $post_date = ParseDate($posted);

      # Keep what is stored when it is newer than this post.
      my $known = $users{$user};
      next if(defined $known && "$known" gt $post_date);

      $users{$user} = bless {
         'date'  => $post_date,
         'url'   => $post_url,
         'title' => $title,
      }, 'ArsForumUserGrab::LastPost';
   }
}

