#!/usr/bin/perl # # au tv guide grabber - # written by ltd # * uses yahoo7 widget for ABC/7/9/10/SBS # * uses yahoo7 portal for ABC2/SBSNEWS/Foxtel/Optus data with caching to reduce query load on server # # loosely based on Michael 'Immir' Smith's excellent 9MSN tv_grab_au # # IMPORTANT NOTE: # this does NOT use any config file # all region/channel settings are hardcoded below - please set them! # $version = "1.30_04aug06"; use LWP::UserAgent; use Time::HiRes qw(gettimeofday tv_interval); use XMLTV; use XML::DOM; use XML::DOM::NodeList; use POSIX qw(strftime mktime); use Getopt::Long; use HTML::TreeBuilder; use Data::Dumper; # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # start of settings # set region appropriate for where we want to get data: # VIC: Melbourne(94), Geelong(93), Mildura/Sunraysia(95), # Eastern Victoria(90), Western Victoria(98) # NSW: Sydney(73), Broken Hill(63), Central Coast(66), Griffith(67), # Northern NSW(69), Southern NSW(71), Remote and Central(106) # QLD: Brisbane(75), Gold Coast(78), Regional QLD(79), Remote and Central(114) # WA: Perth(101), Regional WA(102) # SA: Adelaide(81), Renmark(82), Riverland(83), South East South Australia(85), # Spencer Gulf(86), Remote and Central(107) # NT: Darwin(74), Remote and Central(108) # ACT: ACT(73) # TAS: Tasmania(88) $region=94; # select channel mappings ... # 1. widget mappings: # the channel names ABC, Seven, Nine, TEN, SBS are hardcoded in xml received from yahoo7 widget data # the mappings here are to map to whatever xmltv tags you are using or leave blank if you don't want # data for that channel $y7w_channel_id{"ABC"} = "abcvic.free.au"; $y7w_channel_id{"Seven"} = "channelsevenmelbourne.free.au"; $y7w_channel_id{"Nine"} = "channelninemelbourne.free.au"; $y7w_channel_id{"TEN"} = "networktenmelbourne.free.au"; $y7w_channel_id{"SBS"} = "sbsmelbourne.free.au"; # 2. portal mappings # leave blank if you don't want data for any of these channels $y7p_channel_id{"283793,ABC2"} = "abc2.abc.gov.au"; $y7p_channel_id{"272787,SBS NEWS"} = "news.sbs.com.au"; # $y7p_channel_id{"251261,FOX W"} = ""; # Foxtel # $y7p_channel_id{"267072,Fox Footy Channel"} = ""; # Foxtel # $y7p_channel_id{"251349,Hallmark"} = ""; # Foxtel # $y7p_channel_id{"251262,FOX8"} = ""; # Foxtel # $y7p_channel_id{"251351,Fox Classics"} = ""; # Foxtel # $y7p_channel_id{"251264,The Lifestyle Channel"} = ""; # Foxtel # $y7p_channel_id{"251352,National Geographic"} = ""; # Foxtel # add any more you wish, visit http://au.tv.yahoo.com/results.html and use # 'vn' (venue) number from channel column... # end of settings # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # some initial cruft # $script_start_time = [gettimeofday]; # lets make sure we look exactly like the yahoo widget engine... my $ua; BEGIN { $ua = LWP::UserAgent->new( 'timeout' => 30, 'keep_alive' => 1, 'agent' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-us)' ); $ua->env_proxy; # $ua->cookie_jar({}); $| = 1; } # # parse command line # my $opt_days = 7; # default my $opt_outputfile = ""; # /var/local/tv_grab_au/guide.xml"; # default my $opt_cache_file = "/var/local/tv_grab_au/cache.dat"; # default my $opt_configfile = ""; # ignored my $opt_fast = 0; my $opt_warper = 0; my $opt_obfuscate = 0; my $opt_no_cache = 0; my $opt_no_detail = 0; my $opt_help = 0; my $opt_dont_fetch_widget = 0; my $opt_dont_fetch_portal = 0; my $opt_dont_retry = 0; my $debug = 1; my $lang = "en"; GetOptions( 'days=i' => \$opt_days, 'output=s' => \$opt_outputfile, 'config-file=s' => \$opt_configfile, 'cache-file=s' => \$opt_cache_file, 'no-cache' => \$opt_no_cache, 'fast' => \$opt_fast, 'debug+' => \$debug, 'warper' => \$opt_warper, 'no-widget-data' => \$opt_dont_fetch_widget, 'no-portal-data' => \$opt_dont_fetch_portal, 'lang=s' => \$lang, 'obfuscate' => \$opt_obfuscate, 'no-detail' => \$opt_no_detail, 'no-retry' => \$opt_dont_retry, 'help' => \$opt_help, 'verbose' => \$opt_help, 'v' => \$opt_help); $random_sleep = ($opt_fast ? 0 : 1); &help if ($opt_help); # # go go go! # ($starttime,$endtime) = &calc_dates(time,$opt_days); &log("going to grab $opt_days days data into $opt_outputfile ..."); &read_cache if ($opt_no_cache == 0); &get_y7w_data($starttime,$endtime,$region) if ($opt_dont_fetch_widget == 0); &get_y7p_data($starttime,$endtime,$region) if ($opt_dont_fetch_portal == 0); &write_cache if ($opt_no_cache == 0); &write_data; &print_stats; exit(0); ###################################################################################################### # help sub help { print<new(GET => $url); $request->header('Accept-Encoding' => 'gzip'); if ($opt_obfuscate) { my $randomaddr = sprintf "203.%d.%d.%d",rand(255),rand(255),(rand(254)+1); $request->header('Via' => '1.0 proxy:81 (Squid/2.3.STABLE3)'); $request->header('X-Forwarded-For' => $randomaddr); } printf STDERR "[%d] fetching %s%s: %s\n",time,$status,($opt_obfuscate ? "[obfuscate]" : ""),$url if $debug; for (1..3) { $response = $ua->request($request); last if $response->is_success; $stats{http_failed_requests}++; $attempts++; &sleepy("attempt $attempts failed (url $url), sleeping for 15 seconds",15); } if (!($response->is_success)) { &log("aborting after $attempts attempts to fetch url $url"); printf STDERR "ERROR: could not open url %s in %d attempts\n",$url,$attempts; return undef; } $stats{bytes_fetched} += do {use bytes; length($response->content)}; $stats{http_successful_requests}++; if ($random_sleep) { $sleeptimer = int(rand(10)) + 15; # sleep anywhere from 15 to 25 seconds &sleepy("feeling sleepy for $sleeptimer seconds..",$sleeptimer); } if ($response->header('Content-Encoding') && $response->header('Content-Encoding') eq 'gzip') { $stats{compressed_pages} += do {use bytes; length($response->content)}; $response->content(Compress::Zlib::memGunzip($response->content)); } if ($response->header('Content-type') eq 'xapplication/ywe-octet-stream') { $stats{transformed_pages}++; $base = &transform_output(length($response->content), $response->content); } else { $base = $response->content; } return $base; } ###################################################################################################### sub log { local($entry) = @_; printf STDERR "[%d] %s\n",time,$entry if $debug; } ###################################################################################################### # Convert date time to 'yyyymmddhhmm +hhmm' format as expected by xmltv sub timestring { local($t) = @_; return strftime "%Y%m%d%H%M %z", localtime($t); } ###################################################################################################### sub print_stats { local($key); printf STDERR "completed in %0.2f seconds",tv_interval($script_start_time); foreach $key (sort keys %stats) { printf STDERR ", %d %s",$stats{$key},$key; } printf STDERR "\n"; } ###################################################################################################### # given yahoo7 xml data, parse it into 'shows' .. # parse it into $tv_guide->{$channel}->{data}->{$event_id}-> structures.. sub parse_data { local($data) = @_; my $parser = new XML::DOM::Parser; $tree = $parser->parse($data); my $channels = $tree->getElementsByTagName("venue"); for (my $i = 0; $i < $channels->getLength; $i++) { my $channel = $channels->item($i)->getAttributeNode("co_short")->getValue; # are we interested in this channel? if ($y7w_channel_id{$channel} eq "") { $stats{skipped_programmes}++; next; } # for this channel get every programme ('event') my $events = $channels->item($i)->getElementsByTagName("event"); for (my $j = 0; $j < $events->getLength; $j++) { my $event = $events->item($j); my $event_id = $event->getElementsByTagName("event_id")->item(0)->getFirstChild->getNodeValue; # mandatory fields my $event_start = $event->getElementsByTagName("event_date")->item(0)->getFirstChild->getNodeValue; my $event_end = $event->getElementsByTagName("end_date")->item(0)->getFirstChild->getNodeValue; my $event_title = $event_subtitle = $event_desc1 = $event_desc2 = $event_maincast = $event_year = undef; my $event_rating = $event_genre = $event_runtime = $event_repeatflag = $event_country = undef; # event_id actually isn't unique - so make it so $event_id .= $event_start . $event_end; # wrap these non-mandatory fields in an eval so if they don't exist the script doesn't barf out eval { $event_title = $event->getElementsByTagName("title")->item(0)->getFirstChild->getNodeValue; }; eval { $event_subtitle = $event->getElementsByTagName("subtitle")->item(0)->getFirstChild->getNodeValue; }; eval { $event_desc1 = $event->getElementsByTagName("description_1")->item(0)->getFirstChild->getNodeValue; }; eval { $event_desc2 = $event->getElementsByTagName("description_2")->item(0)->getFirstChild->getNodeValue; }; eval { $event_maincast = $event->getElementsByTagName("main_cast")->item(0)->getFirstChild->getNodeValue; }; eval { $event_year = $event->getElementsByTagName("year_released")->item(0)->getFirstChild->getNodeValue; }; eval { $event_rating = $event->getElementsByTagName("rating")->item(0)->getFirstChild->getNodeValue; }; eval { $event_genre = $event->getElementsByTagName("genre")->item(0)->getFirstChild->getNodeValue; }; eval { $event_runtime = $event->getElementsByTagName("running_time")->item(0)->getFirstChild->getNodeValue; }; eval { $event_repeatflag = $event->getElementsByTagName("repeat")->item(0)->getFirstChild->getNodeValue; }; eval { $event_country = $event->getElementsByTagName("country")->item(0)->getFirstChild->getNodeValue; }; # other fields we dont pick up but exist in source xml data include: # other_title, movie, live, premiere, final, captions, warnings, colour # language, genre_id, sub_category, director, highlight # ext_url, y7_url # add some additional info into description $event_desc1 .= "\n$event_desc2\n" if ($event_desc2 ne ""); $event_desc1 .= "\n\n"; $event_desc1 .= "(Repeat)\n" if ($event_repeatflag > 0); $event_desc1 .= "Rating: $event_rating\n" if ($event_rating ne ""); $event_desc1 .= "Year: $event_year\n" if ($event_year > 0); $event_desc1 .= "Credits/Cast: $event_maincast\n" if ($event_maincast ne ""); $event_desc1 .= "Genre/Category: $event_genre\n" if ($event_genre ne ""); $event_desc1 .= "Running Time: $event_runtime\n" if ($event_runtime > 0); $stats{programmes}++; $stats{duplicate_programmes}++ if ($tv_guide->{$channel}->{data}->{$event_id}); $tv_guide->{$channel}->{data}->{$event_id}->{'channel'} = $y7w_channel_id{$channel}; $tv_guide->{$channel}->{data}->{$event_id}->{'start'} = timestring($event_start); $tv_guide->{$channel}->{data}->{$event_id}->{'stop'} = timestring($event_end); $tv_guide->{$channel}->{data}->{$event_id}->{'title'} = [[ $event_title, $lang ]] if $event_title; $tv_guide->{$channel}->{data}->{$event_id}->{'sub-title'} = [[ $event_subtitle, $lang ]] if $event_subtitle; $tv_guide->{$channel}->{data}->{$event_id}->{'desc'} = [[ $event_desc1, $lang ]] if $event_desc1; $tv_guide->{$channel}->{data}->{$event_id}->{'category'} = [[ $event_genre, $lang ]] if $event_genre; $tv_guide->{$channel}->{data}->{$event_id}->{'country'} = [[ $event_country, $lang ]] if $event_country; } } $tree->dispose; } ###################################################################################################### # descend a structure and clean up various things, including stripping # leading/trailing spaces in strings, translations of html stuff etc # -- taken & modified from Michael 'Immir' Smith's excellent tv_grab_au my %amp; BEGIN { %amp = ( nbsp => ' ', qw{ amp & lt < gt > apos ' quot " } ) } sub cleanup { my $x = shift; if (ref $x eq "REF") { cleanup($_) } elsif (ref $x eq "HASH") { cleanup(\$_) for values %$x } elsif (ref $x eq "ARRAY") { cleanup(\$_) for @$x } elsif (defined $$x) { $$x =~ s/&(#(\d+)|(.*?));/ $2 ? chr($2) : $amp{$3}||' ' /eg; # $$x =~ s/[^\x20-\x7f]/ /g; $$x =~ s/(^\s+|\s+$)//g; } } ###################################################################################################### sub write_data { my %writer_args = ( encoding => 'ISO-8859-1' ); if ($opt_outputfile) { my $fh = new IO::File(">$opt_outputfile") or die "can't open $opt_outputfile: $!"; $writer_args{OUTPUT} = $fh; } my $writer = new XMLTV::Writer(%writer_args); $writer->start ( { 'source-info-url' => "yahoo7-au-parser", 'source-info-name' => "yahoo7-au-parser $version", 'generator-info-name' => "yahoo7-au-parser $version"} ); # write channels collected via y7 widget if ($opt_dont_fetch_widget == 0) { for my $channel (sort keys %y7w_channel_id) { $writer->write_channel( { 'display-name' => [[ $channel, $lang ]], 'id' => $y7w_channel_id{$channel} } ); } } # write channels collected via y7 portal if ($opt_dont_fetch_portal == 0) { for my $channel (sort keys %y7p_channel_id) { my ($venue_id,$channame) = split(/,/,$channel); $writer->write_channel( { 'display-name' => [[ $channame, $lang ]], 'id' => $y7p_channel_id{$channel} } ); } } # write programmes collected via y7 widget if ($opt_dont_fetch_widget == 0) { for my $channel (sort keys %y7w_channel_id) { for my $event_id (sort {$a <=> $b} keys %{($tv_guide->{$channel}->{data})}) { $show = $tv_guide->{$channel}->{data}->{$event_id}; &cleanup($show); $writer->write_programme($show); } } } # write programmes collected via y7 portal if ($opt_dont_fetch_portal == 0) { for my $channel (sort keys %y7p_channel_id) { for my $event_id (sort keys %{($tv_guide->{$channel}->{data})}) { $show = $tv_guide->{$channel}->{data}->{$event_id}; &cleanup($show); $writer->write_programme($show); } } } $writer->end(); } ###################################################################################################### # given yahoo7 portal data, parse it into 'shows' .. sub get_y7p_data { local($starttime,$endtime,$region) = @_; local($currtime, $try_to_add_detail, $want_to_add_detail); foreach my $channel (sort { $y7p_channel_id{$b} <=> $y7p_channel_id{$a} } keys %y7p_channel_id) { my ($venue,$channenname) = split(/,/,$channel); my $unprocessed_programmes = 0; my @unprocessed_progname = undef; my @unprocessed_starttime = undef my @unprocessed_url = undef; for ($currtime = $starttime; $currtime < $endtime; $currtime += 86400) { my $attempts = 1; my @timeattr = localtime($currtime); # 0=sec,1=min,2=hour,3=day,4=month,5=year,6=wday,7=yday,8=isdst my $status = sprintf "yahoo7 portal data: %s: %d of %d%s", $channenname, ((($currtime-$starttime)/86400)+1),(($endtime-$starttime)/86400), ($attempts > 1 ? " (attempt $attempts)" : ""); my $url = sprintf "http://au.tv.yahoo.com/venueresults.html?dt=%s&vn=%d", (strftime "%Y-%m-%d",localtime($currtime)), $venue; my $seen_programmes = 0; do { my $data = &get_url($url,$status); my $tree = HTML::TreeBuilder->new_from_content($data); my $seen_am = 0; for ($tree->look_down('_tag' => 'table', 'width' => '100%', 'border' => '1', 'bordercolor' => '#efefef')) { for ($_->look_down('_tag' => 'tr')) { my $found_time = 0; my $ignore_line = 0; foreach my $tree_td ($_->look_down('_tag' => 'td')) { if ($ignore_line == 0) { if ($found_time == 0) { # looking for time if ($tree_td->as_text() =~ /^(\d+):(\d+)(.)M$/) { $timeattr[2] = $1; # hour $timeattr[2] += 12 if ($3 eq "P"); # pm $timeattr[1] = $2; # min $found_time = mktime(@timeattr); # if entry is PM and we haven't seen any AM entries, ignore - its from the previous day $ignore_line = 1 if (($3 eq "P") && ($seen_am == 0)); $seen_am++ if ($3 eq "A"); } } else { # looking for name if ($_ = $tree_td->look_down('_tag' => 'a')) { my $programme = $_->as_text(); my $progurl = $_->attr('href'); $progurl = sprintf "http://au.tv.yahoo.com/%s",$1 if ($progurl =~ /^javascript:popup\(\"(.*)\"/); $unprocessed_progname[$unprocessed_programmes] = $programme; $unprocessed_starttime[$unprocessed_programmes] = $found_time; $unprocessed_url[$unprocessed_programmes] = $progurl; $unprocessed_programmes++; $seen_programmes++; } } } } } } if ($seen_programmes == 0) { printf STDERR "WARNING: failed to parse any programme data from %s - blocked/rate-limited/format-changed?\n",$url; $stats{failed_to_parse_portal_daily_page}++; $attempts++; } else { $stats{portal_daily_pages}++; } &sleepy("sleeping for 10 seconds",10); } until (($seen_programmes > 0) || ($attempts > 5)) } # have 'n' days of this channel unprocessed - process it! for (my $i = 0; $i < ($unprocessed_programmes-1); $i++) { $stats{programmes}++; my $starttime = $unprocessed_starttime[$i]; my $endtime = $unprocessed_starttime[$i+1]; $tv_guide->{$channel}->{data}->{$starttime}->{'channel'} = $y7p_channel_id{$channel}; $tv_guide->{$channel}->{data}->{$starttime}->{'start'} = timestring($starttime); $tv_guide->{$channel}->{data}->{$starttime}->{'stop'} = timestring($endtime); $tv_guide->{$channel}->{data}->{$starttime}->{'title'} = [[ $unprocessed_progname[$i], $lang ]]; # schedule a detailed data lookup for each programme if we need to # ideally we can use our cached data if it hasn't changed.. # search cache my $cache_key = sprintf "%d,%d,%s,%s", $starttime, $endtime, $$channel, $unprocessed_progname[$i]; if ($data_cache->{$cache_key}) { $stats{used_cached_data}++; &add_cached_data($channel,$starttime,$cache_key); &log("used cache data for programme $unprocessed_progname[$i] on channel $channel"); } else { &log("no cached data for programme $unprocessed_progname[$i] on channel $channel (".$data_cache->{$cache_key}.")"); if ($opt_no_detail == 0) { $try_to_add_detail{$unprocessed_url[$i]} = $cache_key; $want_to_add_detail++; } } } $unprocessed_programmes = 0; } # now that we have our primary data, try to add details if we can, parse through %try_to_add_detail foreach my $url (sort keys %try_to_add_detail) { &get_one_y7p_event($try_to_add_detail{$url},$url,"$want_to_add_detail remaining"); $want_to_add_detail--; &sleepy("sleeping for 16+ seconds",(16+rand(5))) if ($opt_fast == 0); } } ###################################################################################################### # given one yahoo7 portal page, parse it into $tv_guide->{$channel}->{data}->{$event_id}-> structures.. sub get_one_y7p_event { local($cache_key, $url, $status) = @_; my $seen_programme = 0; my ($starttime, $endtime, $channel, $progname) = split(/,/,$cache_key); do { my $data = &get_url($url,$status); my $tree = HTML::TreeBuilder->new_from_content($data); if (my $inner_tree = $tree->look_down('_tag' => 'div', 'class' => 'inner')) { my $event_title = undef, $event_subtitle = undef, $event_description = undef; my $event_genre = undef, $event_duration = undef; $event_title = $_->as_text() if ($_ = $inner_tree->look_down('_tag' => 'h1')); $event_subtitle = $_->as_text() if ($_ = $inner_tree->look_down('_tag' => 'h2')); foreach my $para ($inner_tree->look_down('_tag' => 'p')) { if ($para->as_HTML() =~ /

Genre:  (.*)$/) { $event_genre = $1; } else { $event_description .= $para->as_text(); $event_duration = ($1 * 60) if ($para->as_HTML() =~ /(\d+) mins/); } } if (($event_title) && ($event_duration)) { $stats{portal_detail_pages}++; $seen_programme++; $data_cache->{$cache_key}->{subtitle} = $event_subtitle if $event_subtitle; $data_cache->{$cache_key}->{desc} = $event_description if $event_description; $data_cache->{$cache_key}->{genre} = $event_genre if $event_genre; &add_cached_data($channel,$starttime,$cache_key); } } if ($seen_programme == 0) { printf STDERR "WARNING: failed to parse any programme data from '%s' - blocked/rate-limited/format-changed?\n",$url; $stats{failed_to_parse_portal_detail_page}++; &sleepy("waiting for 3 minutes, will retry then",(180+rand(10))) if ($opt_dont_retry == 0); } } until (($seen_programme> 0) || ($opt_dont_retry>0)); } ###################################################################################################### # populate cache sub read_cache { if (-r $opt_cache_file) { local (@ARGV, $/) = ($opt_cache_file); no warnings 'all'; eval <>; die "$@" if $@; } else { printf STDERR "WARNING: no programme cache $opt_cache_file - have to fetch all details\n"; } } ###################################################################################################### # write out updated cache sub write_cache { if (!(open(F,">$opt_cache_file"))) { printf STDERR "WARNING: could not write cache file $opt_cache_file: $!\n"; printf STDERR "Please fix this in order to reduce the number of queries for data!\n"; } else { # cleanup old entries from cache for my $cache_key (keys %{$data_cache}) { my ($starttime, $endtime, $channel, $progname) = split(/,/,$cache_key); delete $data_cache->{$cache_key} if ($starttime < time); $stats{removed_items_from_cache}++; } print F Data::Dumper->Dump([$data_cache], ["data_cache"]); close F; } } ###################################################################################################### sub add_cached_data { local($channel,$starttime,$cache_key) = @_; $tv_guide->{$channel}->{data}->{$starttime}->{'sub-title'} = [[ $data_cache->{$cache_key}->{subtitle}, $lang ]] if $data_cache->{$cache_key}->{subtitle}; $tv_guide->{$channel}->{data}->{$starttime}->{'desc'} = [[ $data_cache->{$cache_key}->{desc}, $lang ]] if $data_cache->{$cache_key}->{desc}; $tv_guide->{$channel}->{data}->{$starttime}->{'category'} = [[ $data_cache->{$cache_key}->{genre}, $lang ]] if $data_cache->{$cache_key}->{genre}; } ###################################################################################################### sub sleepy { local($logmsg,$sleeptimer) = @_; $stats{slept_for} += $sleeptimer; &log($logmsg); sleep($sleeptimer); } ######################################################################################################