From 8091b1c792342fea895a062a38c2f827bca0799f Mon Sep 17 00:00:00 2001
From: Jacek Kowalski <Jacek@jacekk.info>
Date: Mon, 25 Jun 2012 15:37:57 +0000
Subject: [PATCH] Zmiana sposobu pobierania programu telewizyjnego w związku ze zmianami na stronie wp.pl oraz poprawa linku do kanału RSS Dziennika Internautów.
---
data/tv/pobierz.php | 96 ++++++-----------------
data/tv/wp_parse.php | 135 +++++++++++++++++++++++++++++++++
data/rss/channels.list | 4
3 files changed, 162 insertions(+), 73 deletions(-)
diff --git a/data/rss/channels.list b/data/rss/channels.list
index 71afd81..fcc4a4d 100644
--- a/data/rss/channels.list
+++ b/data/rss/channels.list
@@ -3,10 +3,10 @@
http://wiadomosci.wp.pl/ver,rss,rss.xml wp w,wirtualna polska,wp.pl WP.pl
http://kanaly.rss.interia.pl/fakty.xml interia i,interia.pl Interia.pl
http://rss.gazeta.pl/pub/rss/wiadomosci.xml gazeta g,gazeta.pl Gazeta.pl
-http://rss.di.com.pl/di.rss di dziennik internautow Dziennik Internautów
+http://feeds.feedburner.com/glowny-di di dziennik internautow Dziennik Internautów
http://hacking.pl/rss.xml hacking h,hacking.pl Hacking.pl
http://linuxnews.pl/feed/ linuxnews l,linux,ln,linux news,linuxnews.pl Linux News
http://rss.bankier.pl/wiadomosci/wiadomosci.xml bankier bankier.pl Bankier.pl
http://bash.org.pl/rss/ bash b,sh,bash.org,bash.org.pl bash.org.pl
-# http://a.pl/rss.xml a NULL A.pl
\ No newline at end of file
+# http://a.pl/rss.xml a NULL A.pl
diff --git a/data/tv/pobierz.php b/data/tv/pobierz.php
index 5192250..279e08f 100644
--- a/data/tv/pobierz.php
+++ b/data/tv/pobierz.php
@@ -1,4 +1,6 @@
<?php
+require_once('wp_parse.php');
+
echo STAR.'Pobieranie programu TV...';
$stations = array(
1 => 'TVP 1',
@@ -63,84 +65,50 @@
fwrite($out, '<?xml version="1.0" encoding="UTF-8" ?>
<tv date="'.date('YmdHis O').'" generator-info-name="BotGG" generator-info-url="http://jacekk.info/botgg">
');
-$address = 'http://tv.wp.pl/program.html?stid=$STATION&date=$DATE&time=';
+$address = 'http://tv.wp.pl/program.html?stid=$STATION';
+$date = date('Y-m-d');
$counter = 0;
foreach($stations as $num => $station) {
- fwrite($out, ' <channel id="'.$station.'">
- <display-name>'.$station.'</display-name>
- </channel>
-');
- for($i=0; $i<7; $i++) {
- echo "\r".STAR.'Pobieranie programu TV: '.floor(($counter*7 + $i)/$NUMOF*100).'%';
-
- $timestamp = strtotime('+'.$i.' days');
- $date = date('Y-m-d', $timestamp);
- if(!file_exists('./cache/'.$num.'_'.$date) || filesize('./cache/'.$num.'_'.$date)==0) {
- curl_setopt($c, CURLOPT_URL, str_replace(array('$DATE', '$STATION'), array($date, $num), $address));
- curl_setopt($c, CURLOPT_CONNECTTIMEOUT, 30);
- curl_setopt($c, CURLOPT_FOLLOWLOCATION, TRUE);
- curl_setopt($c, CURLOPT_MAXREDIRS, 5);
- curl_setopt($c, CURLOPT_HTTPHEADER, array('User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.2; pl-PL; rv:1.9.2) Gecko/20100101 Firefox/3.6'));
- curl_setopt($c, CURLOPT_RETURNTRANSFER, TRUE);
- $data = curl_exec($c);
- if(!$data) {
- echo FAIL;
- return;
- }
-
- $data = str_replace(array('id="C_TSR-franc"', 'id="C_TSR-2-franc"', 'id="stationId"', 'id="searchForm"', '&'), array('', '', '', '', '&'), $data);
-
- file_put_contents('./cache/'.$num.'_'.$date, $data);
- unset($data);
+    // One request per station now, so progress is measured against the station count
+    // ($NUMOF is set outside this hunk; the removed code divided counter*7+i by it).
+    echo "\r".STAR.'Pobieranie programu TV: '.floor($counter/count($stations)*100).'%';
+
+    $cache = './cache/'.$num.'_'.$date;
+    if(!file_exists($cache) || filesize($cache)==0) {
+        curl_setopt($c, CURLOPT_URL, str_replace('$STATION', $num, $address));
+        curl_setopt($c, CURLOPT_CONNECTTIMEOUT, 30);
+        curl_setopt($c, CURLOPT_FOLLOWLOCATION, TRUE);
+        curl_setopt($c, CURLOPT_MAXREDIRS, 5);
+        curl_setopt($c, CURLOPT_HTTPHEADER, array('User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.2; pl-PL; rv:1.9.2) Gecko/20100101 Firefox/3.6'));
+        curl_setopt($c, CURLOPT_RETURNTRANSFER, TRUE);
+        $data = curl_exec($c);
+        if(!$data) {
+            echo FAIL;
+            return;
}
- $doc = new DOMDocument;
- $doc->loadHTMLFile('./cache/'.$num.'_'.$date);
- $doc = $doc->getElementById('bxNazwaBoksu')->childNodes;
-
- foreach($doc as $el) {
- if($el instanceof DOMElement) {
- $doc = $el->childNodes;
- break;
- }
- }
-
- $last_time = 0;
- $last_timestamp = 0;
- foreach($doc as $el) {
- if(!$el instanceof DOMElement || substr($el->getAttribute('class'), 0, 7)!='program') continue;
-
- $time = $el->getElementsByTagName('strong')->item(0)->childNodes->item(0)->nodeValue;
- $time = trim($time);
- if($last_time>(int)$time) {
- $timestamp = strtotime('+1 day', $timestamp);
- }
- $last_time = (int)$time;
- $timestamp = strtotime($time, $timestamp);
-
- if($last_timestamp) {
- fwrite($out, ' <programme channel="'.$station.'" start="'.date('YmdHis O', $last_timestamp).'" stop="'.date('YmdHis O', $timestamp).'">
- <title>'.$name.'</title>
- <desc/>
- </programme>
-');
- }
-
- $name = $el->getElementsByTagName('h4')->item(0)->childNodes->item(0)->childNodes->item(0)->nodeValue;
- $name = htmlspecialchars(trim($name), ENT_COMPAT, 'UTF-8');
- $last_timestamp = $timestamp;
- }
-
- fwrite($out, ' <programme channel="'.$station.'" start="'.date('YmdHis O', $timestamp).'" stop="'.date('YmdHis O', $timestamp+3600).'">
- <title>'.$name.'</title>
- <desc/>
- </programme>
-');
-
- unset($doc);
+        file_put_contents($cache, $data);
+        unset($data);
}
+    $doc = new DOMDocument;
+    // libxml warnings about sloppy real-world HTML are expected here, hence the @.
+    @$doc->loadHTMLFile($cache);
+
+    try {
+        $wp = new wp_parse($doc);
+        $wp->xmltv($station, $out);
+    }
+    catch(Exception $e) {
+        // wp_parse throws when the page layout is not recognised. Remove the cached
+        // copy so the next run downloads the page again instead of re-reading it.
+        @unlink($cache);
+        echo FAIL;
+        echo $station.': '.$e->getMessage()."\n";
+        return;
+    }
+
$counter++;
}
diff --git a/data/tv/wp_parse.php b/data/tv/wp_parse.php
new file mode 100644
index 0000000..f3cb7c7
--- /dev/null
+++ b/data/tv/wp_parse.php
@@ -0,0 +1,203 @@
+<?php
+/**
+ * Parser for the weekly schedule page of a single station on tv.wp.pl.
+ *
+ * Wraps an already loaded DOMDocument, locates the schedule grid
+ * (div.ramowka) and serialises it as XMLTV <channel>/<programme> elements.
+ */
+class wp_parse {
+    public $document, $xpath, $context;
+    public $name = '';
+    public $program = array();
+    // Polish month names in the genitive case => month number.
+    public $months = array(
+        'stycznia' => 1,
+        'lutego' => 2,
+        'marca' => 3,
+        'kwietnia' => 4,
+        'maja' => 5,
+        'czerwca' => 6,
+        'lipca' => 7,
+        'sierpnia' => 8,
+        'września' => 9,
+        'października' => 10,
+        'listopada' => 11,
+        'grudnia' => 12,
+    );
+    // Polish weekday names => strtotime() expression for the upcoming such day.
+    public $weekdays = array(
+        'poniedziałek' => 'next Monday',
+        'wtorek' => 'next Tuesday',
+        'środa' => 'next Wednesday',
+        'czwartek' => 'next Thursday',
+        'piątek' => 'next Friday',
+        'sobota' => 'next Saturday',
+        'niedziela' => 'next Sunday',
+    );
+
+    /**
+     * Locates the schedule container and the station name in the document.
+     *
+     * @param DOMDocument $document Parsed station page.
+     * @throws Exception When the schedule container or the station name is
+     *                   not found exactly once.
+     */
+    function __construct(DOMDocument $document) {
+        $this->document = $document;
+        $this->xpath = new DOMXPath($this->document);
+
+        $context = $this->xpath->query('//div[@class="ramowka"]');
+        if($context->length != 1) {
+            throw new Exception('Nie znaleziono ramówki!');
+        }
+        $this->context = $context->item(0);
+
+        $name = $this->xpath->query('.//h2[@class="sh2"]//span//text()', $this->context);
+        if($name->length != 1) {
+            throw new Exception('Nie znaleziono nazwy stacji, błędny HTML.');
+        }
+        $this->name = trim($name->item(0)->nodeValue);
+    }
+
+    /**
+     * Converts a day label from the schedule header into a timestamp.
+     *
+     * Recognised forms: 'dzisiaj' (today), a bare weekday name (resolved with
+     * strtotime('next <weekday>')) and 'pon. 18 czerwca' (weekday
+     * abbreviation, day of month, month name).
+     *
+     * @param string $date Day label as printed on the page.
+     * @return int Unix timestamp of the day the label refers to.
+     * @throws Exception When the label matches none of the known forms.
+     */
+    function parse_date($date) {
+        $date = trim($date);
+
+        if($date == 'dzisiaj') {
+            return mktime(0, 0, 0);
+        }
+        if(isset($this->weekdays[$date])) {
+            // Upcoming day, labelled with the weekday name only.
+            return strtotime($this->weekdays[$date]);
+        }
+
+        // Past day: '<weekday abbreviation> <day> <month name>'.
+        $parts = explode(' ', $date);
+        if(!isset($parts[2]) || !isset($this->months[$parts[2]])) {
+            throw new Exception('Nie udało się przetworzyć daty ('.(isset($parts[2]) ? $parts[2] : $date).')');
+        }
+        $timestamp = mktime(0, 0, 0, $this->months[$parts[2]], (int)$parts[1]);
+
+        // The label carries no year; a date that lands in the future must
+        // belong to the previous year.
+        if($timestamp > time()) {
+            $timestamp = strtotime('-1 year', $timestamp);
+        }
+
+        return $timestamp;
+    }
+
+    /**
+     * Writes the channel and all of its programmes to $fp as XMLTV.
+     *
+     * Every programme ends where the next one starts; the end of the very
+     * last one is unknown and assumed to be one hour after its start.
+     *
+     * @param string   $id Channel identifier for the id/channel attributes.
+     * @param resource $fp Writable stream that receives the XML fragment.
+     * @return void
+     * @throws Exception When a day label in the schedule header cannot be parsed.
+     */
+    function xmltv($id, $fp) {
+        $program = array();
+
+        // One column per day; remember the timestamp each column starts at.
+        $days = array();
+        foreach($this->xpath->query('.//ul[@class="lsDay"]//li', $this->context) as $day) {
+            $days[] = $this->parse_date($day->nodeValue);
+            $program[] = array();
+        }
+
+        // Rows are full hours; each row holds one cell per day.
+        $hours_dom = $this->xpath->query('.//div[@class="hrsOut"]//div[@class="hour"]', $this->context);
+        foreach($hours_dom as $hour) {
+            foreach($this->xpath->query('.//div[@class="col"]', $hour) as $num => $day) {
+                if(!isset($days[$num])) {
+                    // More cells than day labels: no date to attach them to.
+                    continue;
+                }
+                foreach($this->xpath->query('.//div[@class="prog"]', $day) as $entry) {
+                    $godzina = $this->node_text('.//div[@class="tm"]', $entry);
+                    if($godzina === '') {
+                        // No start time, so the entry cannot be placed on the timeline.
+                        continue;
+                    }
+                    $program[$num][] = array(
+                        $godzina,
+                        $this->node_text('.//h3', $entry),
+                        $this->node_text('.//p', $entry),
+                    );
+                }
+            }
+        }
+
+        $channel = htmlspecialchars($id, ENT_COMPAT, 'UTF-8');
+        fwrite($fp, "\t".'<channel id="'.$channel.'">'."\n"
+            ."\t\t".'<display-name>'.htmlspecialchars($this->name, ENT_COMPAT, 'UTF-8').'</display-name>'."\n"
+            ."\t".'</channel>'."\n");
+
+        if(!$days) {
+            return;
+        }
+
+        $last_timestamp = $days[0];
+        $last_prog = NULL;
+        foreach($program as $day => $dayprog) {
+            foreach($dayprog as $prog) {
+                $timestamp = strtotime($prog[0], $last_timestamp);
+                if($timestamp === FALSE) {
+                    // Unparseable start time.
+                    continue;
+                }
+                // Listings run past midnight: a time that goes backwards
+                // belongs to the following calendar day.
+                if($timestamp < $last_timestamp) {
+                    $timestamp = strtotime('+1 day', $timestamp);
+                }
+                while($timestamp < $days[$day]) {
+                    $timestamp = strtotime('+1 day', $timestamp);
+                }
+
+                // A programme is written once the start of its successor is known.
+                if($last_prog !== NULL) {
+                    $this->write_programme($fp, $channel, $last_prog, $last_timestamp, $timestamp);
+                }
+
+                $last_prog = $prog;
+                $last_timestamp = $timestamp;
+            }
+        }
+
+        if($last_prog !== NULL) {
+            $this->write_programme($fp, $channel, $last_prog, $last_timestamp, $last_timestamp + 3600);
+        }
+    }
+
+    // Returns the trimmed text of the first node matching $query below $node, or '' when there is none.
+    private function node_text($query, $node) {
+        $found = $this->xpath->query($query, $node);
+        if(!$found || $found->length == 0) {
+            return '';
+        }
+        return trim($found->item(0)->textContent);
+    }
+
+    // Writes one <programme> element; $channel must already be XML-escaped.
+    private function write_programme($fp, $channel, $prog, $start, $stop) {
+        fwrite($fp, "\t".'<programme channel="'.$channel.'" start="'.date('YmdHis O', $start).'"'
+            .' stop="'.date('YmdHis O', $stop).'">'."\n"
+            ."\t\t".'<title>'.htmlspecialchars($prog[1], ENT_COMPAT, 'UTF-8').'</title>'."\n"
+            ."\t\t".'<desc>'.htmlspecialchars($prog[2], ENT_COMPAT, 'UTF-8').'</desc>'."\n"
+            ."\t".'</programme>'."\n");
+    }
+}
--
Gitblit v1.9.1