From 8091b1c792342fea895a062a38c2f827bca0799f Mon Sep 17 00:00:00 2001
From: Jacek Kowalski <Jacek@jacekk.info>
Date: Mon, 25 Jun 2012 15:37:57 +0000
Subject: [PATCH] Zmiana sposobu pobierania programu telewizyjnego w związku ze zmianami na stronie wp.pl oraz poprawa linku do kanału RSS Dziennika Internautów.

---
 data/tv/pobierz.php    |   96 ++++++-----------------
 data/tv/wp_parse.php   |  162 +++++++++++++++++++++++++++++++++
 data/rss/channels.list |    4
 3 files changed, 189 insertions(+), 73 deletions(-)

diff --git a/data/rss/channels.list b/data/rss/channels.list
index 71afd81..fcc4a4d 100644
--- a/data/rss/channels.list
+++ b/data/rss/channels.list
@@ -3,10 +3,10 @@
 http://wiadomosci.wp.pl/ver,rss,rss.xml wp w,wirtualna polska,wp.pl WP.pl
 http://kanaly.rss.interia.pl/fakty.xml interia i,interia.pl Interia.pl
 http://rss.gazeta.pl/pub/rss/wiadomosci.xml gazeta g,gazeta.pl Gazeta.pl
-http://rss.di.com.pl/di.rss di dziennik internautow Dziennik Internautów
+http://feeds.feedburner.com/glowny-di di dziennik internautow Dziennik Internautów
 http://hacking.pl/rss.xml hacking h,hacking.pl Hacking.pl
 http://linuxnews.pl/feed/ linuxnews l,linux,ln,linux news,linuxnews.pl Linux News
 http://rss.bankier.pl/wiadomosci/wiadomosci.xml bankier bankier.pl Bankier.pl
 http://bash.org.pl/rss/ bash b,sh,bash.org,bash.org.pl bash.org.pl
 
-# http://a.pl/rss.xml a NULL A.pl
\ No newline at end of file
+# http://a.pl/rss.xml a NULL A.pl
diff --git a/data/tv/pobierz.php b/data/tv/pobierz.php
index 5192250..279e08f 100644
--- a/data/tv/pobierz.php
+++ b/data/tv/pobierz.php
@@ -1,4 +1,6 @@
 <?php
+require_once('wp_parse.php');
+
 echo STAR.'Pobieranie programu TV...';
 $stations = array(
 	1 => 'TVP 1',
@@ -63,84 +65,36 @@
 fwrite($out, '<?xml version="1.0" encoding="UTF-8" ?>
 <tv date="'.date('YmdHis O').'" generator-info-name="BotGG" generator-info-url="http://jacekk.info/botgg">
 ');
-$address = 'http://tv.wp.pl/program.html?stid=$STATION&date=$DATE&time=';
+$address = 'http://tv.wp.pl/program.html?stid=$STATION';
+$date = date('Y-m-d');
 
 $counter = 0;
 foreach($stations as $num => $station) {
-	fwrite($out, '	<channel id="'.$station.'">
-		<display-name>'.$station.'</display-name>
-	</channel>
-');
-	for($i=0; $i<7; $i++) {
-		echo "\r".STAR.'Pobieranie programu TV: '.floor(($counter*7 + $i)/$NUMOF*100).'%';
-
-		$timestamp = strtotime('+'.$i.' days');
-		$date = date('Y-m-d', $timestamp);
-		if(!file_exists('./cache/'.$num.'_'.$date) || filesize('./cache/'.$num.'_'.$date)==0) {
-			curl_setopt($c, CURLOPT_URL, str_replace(array('$DATE', '$STATION'), array($date, $num), $address));
-			curl_setopt($c, CURLOPT_CONNECTTIMEOUT, 30);
-			curl_setopt($c, CURLOPT_FOLLOWLOCATION, TRUE);
-			curl_setopt($c, CURLOPT_MAXREDIRS, 5);
-			curl_setopt($c, CURLOPT_HTTPHEADER, array('User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.2; pl-PL; rv:1.9.2) Gecko/20100101 Firefox/3.6'));
-			curl_setopt($c, CURLOPT_RETURNTRANSFER, TRUE);
-			$data = curl_exec($c);
-			if(!$data) {
-				echo FAIL;
-				return;
-			}
-
-			$data = str_replace(array('id="C_TSR-franc"', 'id="C_TSR-2-franc"', 'id="stationId"', 'id="searchForm"', '&'), array('', '', '', '', '&'), $data);
-
-			file_put_contents('./cache/'.$num.'_'.$date, $data);
-			unset($data);
+	echo "\r".STAR.'Pobieranie programu TV: '.floor($counter/$NUMOF*100).'%';
+
+	if(!file_exists('./cache/'.$num.'_'.$date) || filesize('./cache/'.$num.'_'.$date)==0) {
+		curl_setopt($c, CURLOPT_URL, str_replace(array('$DATE', '$STATION'), array($date, $num), $address));
+		curl_setopt($c, CURLOPT_CONNECTTIMEOUT, 30);
+		curl_setopt($c, CURLOPT_FOLLOWLOCATION, TRUE);
+		curl_setopt($c, CURLOPT_MAXREDIRS, 5);
+		curl_setopt($c, CURLOPT_HTTPHEADER, array('User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.2; pl-PL; rv:1.9.2) Gecko/20100101 Firefox/3.6'));
+		curl_setopt($c, CURLOPT_RETURNTRANSFER, TRUE);
+		$data = curl_exec($c);
+		if(!$data) {
+			echo FAIL;
+			return;
 		}
 
-		$doc = new DOMDocument;
-		$doc->loadHTMLFile('./cache/'.$num.'_'.$date);
-		$doc = $doc->getElementById('bxNazwaBoksu')->childNodes;
-
-		foreach($doc as $el) {
-			if($el instanceof DOMElement) {
-				$doc = $el->childNodes;
-				break;
-			}
-		}
-
-		$last_time = 0;
-		$last_timestamp = 0;
-		foreach($doc as $el) {
-			if(!$el instanceof DOMElement || substr($el->getAttribute('class'), 0, 7)!='program') continue;
-
-			$time = $el->getElementsByTagName('strong')->item(0)->childNodes->item(0)->nodeValue;
-			$time = trim($time);
-			if($last_time>(int)$time) {
-				$timestamp = strtotime('+1 day', $timestamp);
-			}
-			$last_time = (int)$time;
-			$timestamp = strtotime($time, $timestamp);
-
-			if($last_timestamp) {
-				fwrite($out, '	<programme channel="'.$station.'" start="'.date('YmdHis O', $last_timestamp).'" stop="'.date('YmdHis O', $timestamp).'">
-		<title>'.$name.'</title>
-		<desc/>
-	</programme>
-');
-			}
-
-			$name = $el->getElementsByTagName('h4')->item(0)->childNodes->item(0)->childNodes->item(0)->nodeValue;
-			$name = htmlspecialchars(trim($name), ENT_COMPAT, 'UTF-8');
-			$last_timestamp = $timestamp;
-		}
-
-		fwrite($out, '	<programme channel="'.$station.'" start="'.date('YmdHis O', $timestamp).'" stop="'.date('YmdHis O', $timestamp+3600).'">
-		<title>'.$name.'</title>
-		<desc/>
-	</programme>
-');
-
-		unset($doc);
+		file_put_contents('./cache/'.$num.'_'.$date, $data);
+		unset($data);
 	}
 
+	$doc = new DOMDocument;
+	@$doc->loadHTMLFile('./cache/'.$num.'_'.$date);
+
+	$wp = new wp_parse($doc);
+	$wp->xmltv($station, $out);
+
 	$counter++;
 }
 
diff --git a/data/tv/wp_parse.php b/data/tv/wp_parse.php
new file mode 100644
index 0000000..f3cb7c7
--- /dev/null
+++ b/data/tv/wp_parse.php
@@ -0,0 +1,162 @@
+<?php
+/**
+ * Parses a tv.wp.pl station schedule page and writes it out as XMLTV.
+ */
+class wp_parse {
+	var $document, $xpath, $context;
+	var $name = '';
+	var $program = array();
+	var $months = array(
+		'stycznia' => 1,
+		'lutego' => 2,
+		'marca' => 3,
+		'kwietnia' => 4,
+		'maja' => 5,
+		'czerwca' => 6,
+		'lipca' => 7,
+		'sierpnia' => 8,
+		'września' => 9,
+		'października' => 10,
+		'listopada' => 11,
+		'grudnia' => 12,
+	);
+	var $weekdays = array(
+		'poniedziałek' => 'next Monday',
+		'wtorek' => 'next Tuesday',
+		'środa' => 'next Wednesday',
+		'czwartek' => 'next Thursday',
+		'piątek' => 'next Friday',
+		'sobota' => 'next Saturday',
+		'niedziela' => 'next Sunday',
+	);
+
+	/**
+	 * Locates the schedule container and the station name in the document.
+	 *
+	 * @param DOMDocument $document Parsed HTML of the station page.
+	 * @throws Exception If the schedule block or the station name is not found exactly once.
+	 */
+	function __construct(DOMDocument $document) {
+		$this->document = $document;
+		$this->xpath = new DOMXPath($this->document);
+
+		$context = $this->xpath->query('//div[@class="ramowka"]');
+		if($context->length != 1) {
+			throw new Exception('Nie znaleziono ramówki!');
+		}
+		$this->context = $context->item(0);
+
+		$name = $this->xpath->query('.//h2[@class="sh2"]//span//text()', $this->context);
+		if($name->length != 1) {
+			throw new Exception('Nie znaleziono nazwy stacji, błędny HTML.');
+		}
+		$this->name = $name->item(0)->nodeValue;
+	}
+
+	/**
+	 * Converts a day label from the schedule header into a midnight timestamp.
+	 *
+	 * @param string $date 'dzisiaj', a weekday name, or a label such as 'pon. 18 czerwca'.
+	 * @return int Unix timestamp.
+	 * @throws Exception If the month name is not recognised.
+	 */
+	function parse_date($date) {
+		if($date == 'dzisiaj') {
+			// 'dzisiaj' (today)
+			return mktime(0, 0, 0);
+		}
+		elseif(isset($this->weekdays[$date])) {
+			// upcoming date given as a weekday name, e.g. 'poniedziałek'
+			return strtotime($this->weekdays[$date]);
+		}
+		else
+		{
+			// past date, e.g. 'pon. 18 czerwca'
+			$date = explode(' ', $date);
+			if(!isset($this->months[$date[2]])) {
+				throw new Exception('Nie udało się przetworzyć daty ('.$date[2].')');
+			}
+			$timestamp = mktime(0, 0, 0, $this->months[$date[2]], $date[1]);
+
+			// A result that lies in the future is moved back by one year
+			if($timestamp > time()) {
+				$timestamp = strtotime('-1 year', $timestamp);
+			}
+
+			return $timestamp;
+		}
+	}
+
+	/**
+	 * Writes the <channel> element and the <programme> elements for this station.
+	 *
+	 * Each programme ends where the next one begins, so an entry is written only
+	 * once its successor is seen; the last programme in the listing is not emitted.
+	 *
+	 * @param string $id Channel identifier used in the XMLTV output.
+	 * @param resource $fp Open file handle to write to.
+	 */
+	function xmltv($id, $fp) {
+		$program = array();
+
+		$days_dom = $this->xpath->query('.//ul[@class="lsDay"]//li', $this->context);
+		$days = array();
+		foreach($days_dom as $day) {
+			$days[] = $this->parse_date($day->nodeValue);
+			$program[] = array();
+		}
+		unset($days_dom, $day);
+
+		$hours_dom = $this->xpath->query('.//div[@class="hrsOut"]//div[@class="hour"]', $this->context);
+		// Successive rows (full hours)
+		foreach($hours_dom as $in => $hour) {
+			$days_dom = $this->xpath->query('.//div[@class="col"]', $hour);
+			// Sets of programmes at that hour for successive days
+			foreach($days_dom as $num => $day) {
+				$programs_dom = $this->xpath->query('.//div[@class="prog"]', $day);
+				// Successive programmes within the given hour and day
+				foreach($programs_dom as $n => $programs) {
+					$godzina = $this->xpath->query('.//div[@class="tm"]', $programs)->item(0)->textContent;
+					$nazwa = $this->xpath->query('.//h3', $programs)->item(0)->textContent;
+					$opis = $this->xpath->query('.//p', $programs)->item(0)->textContent;
+
+					$program[$num][] = array($godzina, $nazwa, $opis);
+				}
+				unset($programs_dom, $programs);
+			}
+			unset($days_dom, $day);
+		}
+		unset($hours_dom, $hour, $godzina, $nazwa, $opis);
+
+		fwrite($fp, "\t".'<channel id="'.$id.'">'."\n"
+			."\t\t".'<display-name>'.htmlspecialchars($this->name).'</display-name>'."\n"
+			."\t".'</channel>'."\n");
+
+		$last_timestamp = $timestamp = $days[0];
+		$last_prog = NULL;
+		foreach($program as $day => $dayprog) {
+			foreach($dayprog as $prog) {
+				$timestamp = strtotime($prog[0], $last_timestamp);
+				if($timestamp < $last_timestamp) {
+					$timestamp = strtotime('+1 day', $timestamp);
+				}
+				while($timestamp < $days[$day]) {
+					$timestamp = strtotime('+1 day', $timestamp);
+				}
+
+				// The first programme has no predecessor to close, so nothing is written yet
+				if($last_prog !== NULL) {
+					fwrite($fp, "\t".'<programme channel="'.$id.'" start="'.date('YmdHis O', $last_timestamp).'"'
+						.' stop="'.date('YmdHis O', $timestamp).'">'."\n"
+						."\t\t".'<title>'.htmlspecialchars($last_prog[1]).'</title>'."\n"
+						."\t\t".'<desc>'.htmlspecialchars($last_prog[2]).'</desc>'."\n"
+						."\t".'</programme>'."\n");
+				}
+
+				$last_prog = $prog;
+				$last_timestamp = $timestamp;
+			}
+		}
+	}
+}
+?>
-- 
Gitblit v1.9.1