#!/usr/bin/perl -w # # "Rex" my $version = '1.2.1'; # An Australian TV Guide Grabber (a.k.a. tv_grab_au) # by Max Barry # http://www.maxbarry.com # # Based on the long-serving but currently defunct NMSN Australian TV grabber # by Michael 'Immir' Smith # # Options: --configure configure the grabber # --config-file use specified config file # --cache-file use specified cache file # --show-channels show subscribed channels & exit # --show-config show configuration details & exit # --days days to grab # --offset skip first n days # --output xml output file # --ignore-cache destroy & rebuild cache # --dump-cache print cache & exit # --stats print stats every n secs (0=off) # --test don't write output or cache # --warper use webwarper.net anonymizer # # A current version of this script, plus a README file, might be in here: # http://www.whuffy.com/tv_grab_au/ # # To install Perl dependencies (like XMLTV.pm), you generally need to # do this (as root): perl -MCPAN -e 'install ' # E.g. perl -MCPAN -e 'install XMLTV::Ask' # # Changelog: # 0.1.0 : Let there be code # 0.2.0 : Better caching (fewer HTTP connections) # 0.3.0 : Aborted attempt to use proxy caches # 0.4.0 : Reverted to non-proxy version; switched datasources # 1.0.0 : --configure works; code released # 1.0.1 : Bugfix : --config-file now works # 1.1.0 : Feature: --stats option # 1.1.1 : Bugfix : download Pay TV show details properly # 1.1.2 : Bugfix : replaced non-working --static with --test; # put sport and free-to-air movies in correct category # for MythTV # 1.1.3 : Bugfix : don't die on a failed download, just report # 1.2.0 : Feature: --cache-file option # 1.2.1 : Bugfix : better explanation for config-file failure use strict; use Getopt::Long; use HTTP::Request::Common; use LWP::UserAgent; use Date::Manip; use File::Path; use File::Basename; use Data::Dumper; use HTML::TreeBuilder; use Storable; use XMLTV; use XMLTV::Ask; use XMLTV::Config_file; # --------------------------------------------------------------------------- # --- Global Variables my $progname = "Rex"; my $lang = "en"; my $output_dir = $ENV{HOME} . "/." . lc($progname); # That is: ~/.rex my $cache_file = "$output_dir/cache.dat"; my ($count_dl, $count_detail, $count_bad, $count_adjust, $count_cache) = (0) x 5; my $DATASOURCE = "http://www.yourtv.com.au"; my $DATASOURCE_SETUP = "$DATASOURCE/profile/index.cfm?action=saveRegions"; my $DATASOURCE_GUIDE = "$DATASOURCE/guide/index.cfm"; my $DATASOURCE_GUIDE_TODAY = "$DATASOURCE/guide/index.cfm?action=restofday"; my $DATASOURCE_DETAIL = "$DATASOURCE/guide/index.cfm"; my $WW = "http://webwarper.net/ww/"; my $runtime = time(); my $laststats = $runtime; my $firstfetch; my $debug = 0; my $opt; my $channels = {}; my %chanid; my @channellist; my $cached; my $ua; my $sid; my @showpids; my %shows; # --------------------------------------------------------------------------- # --- Setup print "$progname $version\n"; Getopt::Long::Configure(qw/pass_through/); get_initial_command_line_options(); my $config_file = XMLTV::Config_file::filename($opt->{configfile}, lc($progname), not $debug); read_config_file(); get_remaining_command_line_options(); configure() if ($opt->{configure}); set_defaults(); if ($opt->{test}) { print "Test mode: won't write XML or cache.\n"; $opt->{output} = undef; } build_channel_map(); if ($opt->{show_channels}) { show_channels(); exit 0; } Date_Init("TZ=$opt->{TZ}"); # explicitly Set Timezone if ($debug or $opt->{show_config}) { show_config(); exit 0 if ($opt->{show_config}); } restore_cache(); if ($opt->{dump_cache}) { dump_cache(); exit 0; } # --------------------------------------------------------------------------- # --- Go! get_guide_data(); get_details_data(); save_cache(); write_xml(); print stats(1); exit 0; # --------------------------------------------------------------------------- # --- Stop! # # Subs! # sub get_guide_data { print "Grabbing data for days " . ($opt->{offset} + 1) . " - $opt->{days} into $opt->{output}.\n"; refresh_ua() unless ($ua); my ($guidedata, $date); for my $day ($opt->{offset} .. $opt->{days} - 1) { $date = Ymd(DateCalc("today", "+ $day days")); print "Day $day.\n" if ($debug); if (!$day) { # Special bandwidth-saving URL for day 0 $guidedata = get_page($DATASOURCE_GUIDE_TODAY); parse_guide(Encode::decode_utf8($guidedata)); } else { # Need to grab day in 6-hour chunks. for (1 .. 4) { $guidedata = post_page($DATASOURCE_GUIDE, [ 'action' => "sessionTimes", 'region_id' => $opt->{regionid}, 'date' => $date, 'period' => $_, 'submit' => 'submit' ]); parse_guide(Encode::decode_utf8($guidedata)); } } } # We now have a list of desired show IDs in @showpids. print "Shows found: ". @showpids . ".\n"; } # # This sub fills up %shows with details, either from the cache or # from the web. # sub get_details_data { my ($show, $detailsdata, $dcount); $firstfetch = time(); $dcount = 0; foreach my $pid (@showpids) { # See if we've got this in the cache. if ($cached->{$pid}) { print "Cached: " . $cached->{$pid}->{title}[0][0] . ".\n" if ($debug); $shows{$pid} = $cached->{$pid}; $count_cache++; } else { $dcount++; refresh_ua() if ($dcount % 20 == 0); # don't wait for error page $shows{$pid} = download_show($pid); $count_detail++; sleep int(rand(10)); } if ($opt->{stats} and time() - $laststats >= $opt->{stats}) { print stats(); } } } sub download_show { my ($pid, $recurse_count) = @_; $recurse_count ||= 0; return undef if ($recurse_count > 3); print "Downloading # $pid.\n" if ($debug); my $detailsdata = get_page($DATASOURCE_DETAIL . '?action=session_info&event_id=' . $pid . '&sid=' . $sid . '&loc=grid'); my $result = parse_details(Encode::decode_utf8($detailsdata)); if ($result) { return $result; } else { print "Download failed.\n" if ($debug); $count_bad++; refresh_ua(); return download_show($pid, $recurse_count+1); } } sub save_cache { return if ($opt->{test}); print "Saving cache.\n"; Storable::store(\%shows, $cache_file); } sub write_xml { return if ($opt->{test}); my %writer_args = ( encoding => 'ISO-8859-1' ); print "Writing XML.\n"; if ($opt->{output}) { my $fh = new IO::File(">" . $opt->{output}) or die "can't open " . $opt->{output} . ": $!"; $writer_args{OUTPUT} = $fh; } my $writer = new XMLTV::Writer(%writer_args); $writer->start ( { 'source-info-url' => $DATASOURCE, 'source-info-name' => "Datasource Name", 'generator-info-name' => "XMLTV - $progname v$version"} ); for my $channel (sort keys %$channels) { my $chanid = $chanid{lc $channel}; $writer->write_channel( { 'display-name' => [[$channel, $lang]], 'id' => $chanid } ); } foreach my $pid (keys %shows) { print "- " . $shows{$pid}->{'title'}[0][0] . "\n" if ($debug); $writer->write_programme($shows{$pid}); } $writer->end(); } sub refresh_ua { print "Refreshing UA.\n" if ($debug); if ($ua) { print "Sleeping...\n" if ($debug); sleep 5 + int(rand(20)); } my $agent = ( 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4', 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.7.6) Gecko/20050512 Firefox', 'Opera/9.00 (Windows NT 5.1; U; en)', 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en-us) AppleWebKit/412 (KHTML, like Gecko) Safari/412' )[int(rand(5))]; $ua = LWP::UserAgent->new ('timeout' => 30, 'keep_alive' => 1, 'agent' => $agent); $ua->env_proxy; $ua->cookie_jar({}); # Set initial cookie get_page($DATASOURCE, 1); # Set region/service cookie post_page($DATASOURCE_SETUP, [ 'fta_region_id' => $opt->{'regionid'}, 'pay_region_id' => $opt->{'payregionid'} ], 1 ); $ua->cookie_jar()->scan(\&refresh_sid); } sub refresh_sid { my ($version, $key, $val) = @_; $sid = $val if ($key eq 'CFID'); } sub restore_cache { unless ($opt->{ignore_cache}) { if (-r $cache_file) { $cached = Storable::retrieve($cache_file); } else { print "Unable to read cache file: $cache_file.\n"; } } if ($cached) { print "Retrieved " . keys(%$cached) . " cached items from file.\n"; } else { $cached = { }; print "Not using cache.\n"; } } sub dump_cache { print "Cache: " . Dumper($cached) . "\n"; } sub stats { my $finished = shift; my $t = time() - $runtime; my $ret = "$progname $version " . ($finished ? "finished" : "in progress") . ":\n"; $ret .= sprintf( " %d shows grabbed\n" . " %d downloads, including %d detail pages\n" . " %d cache hits, %d failed detail pages, %d stop times adjusted\n", scalar(keys %shows), $count_dl, $count_detail, $count_cache, $count_bad, $count_adjust); $ret .= " Time elapsed: " . timestats($t) . "\n"; unless ($finished or !$count_detail) { $t = ((((time() - $firstfetch) * (@showpids - $count_cache)) / $count_detail)) - $t; $ret .= " Estimated time remaining: " . timestats($t) . "\n"; } $laststats = time(); return $ret; } sub timestats { my $t = shift; my $ret = ''; if ($t >= 3600) { $ret .= sprintf("%d hr ", $t / 3600); $t = $t % 3600; } $ret .= sprintf("%d min %d sec", $t / 60, $t % 60); return $ret; } sub get_page { my ($url, $ignore_failure) = @_; my $request = GET $url; return fetch_page($request, $ignore_failure); } sub post_page { my ($url, $headers, $ignore_failure) = @_; my $request = POST $url, $headers; return fetch_page($request, $ignore_failure); } sub fetch_page { my ($request, $ignore_failure) = @_; $request->uri() =~ s/^http:\/\//$WW/ if $opt->{warper}; print "Fetching: " . $request->as_string() . "\n" if ($debug); my $response; for my $c (1..2) { print "Attempt #$c.\n" if ($debug); $response = $ua->request($request); last if ($response->is_success() or $ignore_failure); sleep 5; } unless ($response->is_success() or $ignore_failure) { print "ERROR! Failed to retrieve page: " . $request->uri() . ".\n"; } if ($debug and (my $r = $response)->previous) { print "GET_CONTENT_BASE redirection backtrace:\n"; while ($r) { print " ", $r->base, "\n"; $r = $r->previous } } $count_dl++; my $page = $response->content(); $page =~ s/ / /g; return $page; } sub parse_guide { my $guidedata = shift; print "Parsing guide page.\n" if ($debug); my $tree = HTML::TreeBuilder->new_from_content($guidedata); my $curchan = ''; my ($pid, $block, $line, $link, $title); foreach my $tag ($tree->look_down('_tag' => 'td', 'class' => 'venue')) { next if ($curchan eq $tag->as_text()); # Ignore repeated Station name $curchan = $tag->as_text(); $curchan =~ s/\(.*\)//; if ($opt->{'configure'}) { push (@channellist, $curchan); next; } if ($channels->{$curchan}) { print "Channel: $curchan.\n" if ($debug); $block = $tag->parent(); foreach $line ($block->look_down('_tag' => 'td', 'class' => undef)) { $line = $line->extract_links('a'); foreach $link (@$line) { unless ($link->[0] =~ /session_info\('(\d+)'/) { print "Parsing error: No pid found in block!\n"; next; } $pid = $1; $title = $link->[1]->as_text(); if (grep (/$pid/, @showpids)) { # don't count same show twice print "Duplicate: $title ($pid).\n" if ($debug); } else { print "New: $title ($pid).\n" if ($debug); push (@showpids, $pid); } } } } else { print "Ignoring unsubscribed channel $curchan.\n" if ($debug); } } } sub parse_details { my $detailsdata = shift; my ($show, $str, @rows, $block, $start, $stop, $date, @extra, @items); my $tree = HTML::TreeBuilder->new_from_content($detailsdata); $block = $tree->find('h1'); return undef unless ($block); # site is probably sending that block page $show->{'title'} = [[ strip_whitespace($block->as_text()), $lang ]]; # print "Title: " . $show->{'title'}[0][0] . "\n"; $str = $tree->find('h2'); if ($str) { $show->{'sub-title'} = [[ strip_whitespace($str->as_text()), $lang ]]; } $block = $tree->find('h3'); @rows = $block->look_down('_tag' => 'div'); $date = $rows[1]->as_text(); $rows[0]->as_HTML() =~ /([\d\.]+[ap]m)\s+-\s+([\d\.]+[ap]m)/; ($start, $stop) = ($1, $2); $start =~ s/\./:/; $stop =~ s/\./:/; $show->{'start'} = expand_date("$start $date"); $show->{'stop'} = expand_date("$stop $date"); if (Date_Cmp($show->{'start'}, $show->{'stop'}) == 1) { $show->{'stop'} = expand_date(DateCalc($show->{'stop'}, "+ 1 day")); $count_adjust++; print "Adjusted STOP time.\n" if ($debug); } $show->{'channel'} = $chanid{lc($rows[0]->find('span')->as_text())}; $block = $tree->find('_tag' => 'hr', 'noshade')->right(); if ($block->as_text()) { $show->{'desc'} = [[ strip_whitespace($block->as_text()), $lang ]]; $block = $block->right()->right(); } else { $block = $block->right(); } foreach my $tag ($block->look_down('_tag' => 'tr')) { $str = $tag->as_text(); if ($str =~ /Genre:(.+)/) { $str = strip_whitespace($1); if ($str eq 'N/A') { @extra = ( $str ); } else { @extra = split(/\//, $str); } $str = $tree->find('h5'); if ($str and $str->as_text() =~ /movie/i) { unshift (@extra, 'movie'); } foreach (@extra) { $_ = 'sports' if ($_ eq 'Sport'); $_ = [ $_, $lang ]; } $show->{'category'} = [ @extra ]; } elsif ($str =~ /Rating:(.+)/) { $str = strip_whitespace($1); if ($str =~ /(.*?)\s*\[(.+)\]/) { @extra = split(/, /, $2); foreach (@extra) { $_ = [ $_, 'advisory', undef ]; } $show->{'rating'} = [[ $1, 'CTVA', undef], @extra ]; } else { $show->{'rating'} = [[ $1, "CTVA", undef]]; } } elsif ($str =~ /Cast: (.+)/) { $show->{'credits'}{'actor'} = [ split(/, /, strip_whitespace($1)) ]; } elsif ($str =~ /Year:(\d+)/) { # $show->{'date'} = strip_whitespace($1); } elsif ($str =~ /Other:(.+)/) { next unless (strip_whitespace($1)); $str = $tag->as_HTML(); @extra = split(/
/, $tag->as_HTML()); my @to_add; foreach my $bit (@extra) { $bit =~ s/<.*>//; $bit = strip_whitespace($bit); next unless ($bit); push (@to_add, $bit); } foreach (@to_add) { if ($show->{'desc'}) { $show->{'desc'}[0][0] .= " $_."; } else { $show->{'desc'} = [[ "$_.", $lang ]]; } } } } print Dumper($show) if ($debug); return $show; } sub read_config_file { print "Reading configuration file: $config_file\n"; if (-r $config_file) { local (@ARGV, $/) = ($config_file); no warnings 'all'; eval <>; if ($@ and !$opt->{configure}) { my $str = "\nError in configuration file!\n"; if ($opt->{configfile}) { $str .= "$progname was sent the option " . "\"--config-file $config_file\",\n" . "but this file failed to parse correctly.\n" . "You may be pointing Rex at the wrong configuration file.\n" . "Try (re-)configuring $progname with this option " . "(i.e. --configure --config-file $config_file).\n"; } die "$str\nError details:\n$@"; } } else { print "\nNo configuration file found.\n$progname is not configured!\n\n"; $opt->{'configure'} = 1; } } sub get_initial_command_line_options { GetOptions( 'config-file=s' => \$opt->{configfile}, 'debug' => \$debug); } sub get_remaining_command_line_options { GetOptions( 'days=i' => \$opt->{days}, 'offset=i' => \$opt->{offset}, 'show-channels' => \$opt->{show_channels}, 'output=s' => \$opt->{output}, 'configure' => \$opt->{configure}, 'ignore-cache' => \$opt->{ignore_cache}, 'dump-cache' => \$opt->{dump_cache}, 'test' => \$opt->{test}, 'show-config' => \$opt->{show_config}, 'warper' => \$opt->{warper}, 'cache-file=s' => \$cache_file, 'stats=i' => \$opt->{stats}); } sub show_config { my $short = shift; print "\nConfiguration\n". "-------------\n"; unless ($short) { print "Debug mode : " . is_set($debug) . "\n" . "Test mode : " . is_set($opt->{test}) . "\n" . "Webwarping : " . is_set($opt->{warper}) . "\n"; } print "Config file: $config_file\n" . "Cache file : $cache_file\n" . "Output file: " . ($opt->{output} ? $opt->{output} : "None") . "\n" . "TimeZone : $opt->{TZ}\n" . "Region ID : $opt->{regionid}\n" . "Pay Region : " . ($opt->{payregionid} ? $opt->{payregionid} : 0) . "\n" . "Days wanted: $opt->{days} (offset: $opt->{offset})\n"; show_channels(); print "\n"; } sub is_set { my $arg = shift; return $arg ? "Yes" : "No"; } sub show_channels { print "Subscribed channels:\n"; print " $_ -> $channels->{$_}\n" for sort keys %$channels; } sub build_channel_map { # --- extract sorted subscribed channel list from config-file hash; # also compute canonicalised lowercased channel to xmltvid hash %chanid = map { lc $_, $channels->{$_} } keys %$channels; } sub set_defaults { my $defaults = { 'days' => 5, 'offset' => 0, 'TZ' => '+1000', 'output' => "$output_dir/guide.xml" }; foreach (keys %$defaults) { unless ($opt->{$_}) { $opt->{$_} = $defaults->{$_}; } } $opt->{'days'} = 7 if ($opt->{'days'} > 7); } sub expand_date { my $t = shift; my $ret = ParseDate($t); # --- append timezone and strip colons ($ret .= " $opt->{TZ}") =~ tr/://d; return $ret; } sub make_cache_id { my $show = shift; return join(':', $show->{'start'}, $show->{'channel'}, $show->{'title'}[0][0]); } sub configure { my $REGIONS = { "ACT - ACT" => 126, "NSW - Sydney" => 73, "NSW - Newcastle" => 184, "NSW - Central Coast" => 66, "NSW - Griffith" => 67, "NSW - Broken Hill" => 63, "NSW - Northern NSW" => 69, "NSW - Southern NSW" => 71, "NSW - Remote and Central" => 106, "NT - Darwin" => 74, "NT - Remote and Central" => 108, "QLD - Brisbane" => 75, "QLD - Gold Coast" => 78, "QLD - Regional" => 79, "QLD - Remote and Central" => 114, "SA - Adelaide" => 81, "SA - Renmark" => 82, "SA - Riverland" => 83, "SA - South East SA" => 85, "SA - Spencer Gulf" => 86, "SA - Remote and Central" => 107, "TAS - Tasmania" => 88, "VIC - Melbourne" => 94, "VIC - Geelong" => 93, "VIC - Eastern Victoria" => 90, "VIC - Mildura/Sunraysia" => 95, "VIC - Western Victoria" => 98, "WA - Perth" => 101, "WA - Regional" => 102 }; my $PAYTV = { "Free-to-air only (no Pay TV)" => 0, "FOXTEL Digital" => 168, "FOXTEL Analogue Cable" => 123, "FOXTEL Analogue Satellite" => 129, "Optus" => 124, "Optus feat. FOXTEL Digital" => 192, "AUSTAR" => 125, "AUSTAR Digital" => 169, "TransACT" => 128, "World Media International" => 170, "Neighborhood Cable" => 171, "SELECTV - English" => 193 }; print "Configuring...\n"; $opt->{TZ} = ask("Enter your timezone (e.g. \"+1000\"):"); print "Select your region code:\n"; foreach (sort keys %$REGIONS) { printf(" (%3d) %s\n", $REGIONS->{$_}, $_); } $opt->{regionid} = ask_choice("Enter region code:", "94", values %$REGIONS); print "\nSelect a Pay TV option:\n"; foreach (sort keys %$PAYTV) { printf(" (%3d) %s\n", $PAYTV->{$_}, $_); } $opt->{payregionid} = ask_choice("Enter Pay TV code:", 0, values %$PAYTV); print "\nFetching channel information...\n\n"; refresh_ua() unless ($ua); parse_guide(Encode::decode_utf8(get_page($DATASOURCE_GUIDE_TODAY))); print "For each channel you want guide data for, enter an XMLTV id\n" . "of your choice (e.g. \"seven.free.au\"). If you don't need\n" . "guide data for this channel, just press Enter.\n\n" . "PLEASE NOTE: The more channels you subscribe to, the slower\n". "this program operates and the more load it generates. Please\n". "don't subscribe to unneeded channels.\n\nChannels:\n"; $channels = {}; my $line; foreach (@channellist) { $line = ask(" \"$_\"? "); $channels->{$_} = $line if ($line); } $opt->{days} = ask("\nHow many days guide data do you want (default=5)? "); set_defaults(); print "\nPlease check the following configuration:\n\n"; show_config(1); unless(ask_boolean("Create configuration file?")) { print "Aborting configuration.\n"; exit 0; } delete $opt->{'configure'}; delete $opt->{'output'}; foreach (keys %$opt) { delete $opt->{$_} unless ($opt->{$_}); } # --- open config file and write the configuration -d dirname($config_file) or mkdir dirname($config_file) or die "cannot create directory for $config_file: $!"; -d $output_dir or mkdir $output_dir or die "cannot create directory $output_dir: $!"; open(CONF, ">$config_file") or die "cannot write to $config_file: $!"; print CONF Data::Dumper->Dump([$opt, $channels], ["opt", "channels"]); close CONF; print "Finished configuring.\n\n"; unless (ask_boolean("Grab data now?")) { exit 0; } } sub Ymd { UnixDate($_[0], "%Y-%m-%d") or die "problem in Ymd($_[0])" } sub strip_whitespace { my $str = shift; $str =~ s/^\s*(.*?)\s*$/$1/; return $str; }