From 8091b1c792342fea895a062a38c2f827bca0799f Mon Sep 17 00:00:00 2001
From: Jacek Kowalski <Jacek@jacekk.info>
Date: Mon, 25 Jun 2012 15:37:57 +0000
Subject: [PATCH] Zmiana sposobu pobierania programu telewizyjnego w związku ze zmianami na stronie wp.pl oraz poprawa linku do kanału RSS Dziennika Internautów.

---
 data/tv/pobierz.php    |   96 ++++++-----------------
 data/tv/wp_parse.php   |  135 +++++++++++++++++++++++++++++++++
 data/rss/channels.list |    4 
 3 files changed, 162 insertions(+), 73 deletions(-)

diff --git a/data/rss/channels.list b/data/rss/channels.list
index 71afd81..fcc4a4d 100644
--- a/data/rss/channels.list
+++ b/data/rss/channels.list
@@ -3,10 +3,10 @@
 http://wiadomosci.wp.pl/ver,rss,rss.xml		wp		w,wirtualna polska,wp.pl		WP.pl
 http://kanaly.rss.interia.pl/fakty.xml		interia		i,interia.pl				Interia.pl
 http://rss.gazeta.pl/pub/rss/wiadomosci.xml	gazeta		g,gazeta.pl				Gazeta.pl
-http://rss.di.com.pl/di.rss			di		dziennik internautow			Dziennik Internautów
+http://feeds.feedburner.com/glowny-di		di		dziennik internautow			Dziennik Internautów
 http://hacking.pl/rss.xml			hacking		h,hacking.pl				Hacking.pl
 http://linuxnews.pl/feed/			linuxnews	l,linux,ln,linux news,linuxnews.pl	Linux News
 http://rss.bankier.pl/wiadomosci/wiadomosci.xml	bankier		bankier.pl				Bankier.pl
 http://bash.org.pl/rss/				bash		b,sh,bash.org,bash.org.pl		bash.org.pl
 
-# http://a.pl/rss.xml				a		NULL					A.pl
\ No newline at end of file
+# http://a.pl/rss.xml				a		NULL					A.pl
diff --git a/data/tv/pobierz.php b/data/tv/pobierz.php
index 5192250..279e08f 100644
--- a/data/tv/pobierz.php
+++ b/data/tv/pobierz.php
@@ -1,4 +1,6 @@
 <?php
+require_once('wp_parse.php');
+
 echo STAR.'Pobieranie programu TV...';
 $stations = array(
 	1 => 'TVP 1',
@@ -63,84 +65,36 @@
 fwrite($out, '<?xml version="1.0" encoding="UTF-8" ?>
 <tv date="'.date('YmdHis O').'" generator-info-name="BotGG" generator-info-url="http://jacekk.info/botgg">
 ');
-$address = 'http://tv.wp.pl/program.html?stid=$STATION&date=$DATE&time=';
+$address = 'http://tv.wp.pl/program.html?stid=$STATION';
+$date = date('Y-m-d');
 
 $counter = 0;
 foreach($stations as $num => $station) {
-	fwrite($out, '	<channel id="'.$station.'">
-		<display-name>'.$station.'</display-name>
-	</channel>
-');
-	for($i=0; $i<7; $i++) {
-		echo "\r".STAR.'Pobieranie programu TV: '.floor(($counter*7 + $i)/$NUMOF*100).'%';
-		
-		$timestamp = strtotime('+'.$i.' days');
-		$date = date('Y-m-d', $timestamp);
-		if(!file_exists('./cache/'.$num.'_'.$date) || filesize('./cache/'.$num.'_'.$date)==0) {
-			curl_setopt($c, CURLOPT_URL, str_replace(array('$DATE', '$STATION'), array($date, $num), $address));
-			curl_setopt($c, CURLOPT_CONNECTTIMEOUT, 30);
-			curl_setopt($c, CURLOPT_FOLLOWLOCATION, TRUE);
-			curl_setopt($c, CURLOPT_MAXREDIRS, 5);
-			curl_setopt($c, CURLOPT_HTTPHEADER, array('User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.2; pl-PL; rv:1.9.2) Gecko/20100101 Firefox/3.6'));
-			curl_setopt($c, CURLOPT_RETURNTRANSFER, TRUE);
-			$data = curl_exec($c);
-			if(!$data) {
-				echo FAIL;
-				return;
-			}
-			
-			$data = str_replace(array('id="C_TSR-franc"', 'id="C_TSR-2-franc"', 'id="stationId"', 'id="searchForm"', '&'), array('', '', '', '', '&amp;'), $data);
-			
-			file_put_contents('./cache/'.$num.'_'.$date, $data);
-			unset($data);
+	echo "\r".STAR.'Pobieranie programu TV: '.floor($counter/$NUMOF*100).'%';
+	
+	if(!file_exists('./cache/'.$num.'_'.$date) || filesize('./cache/'.$num.'_'.$date)==0) {
+		curl_setopt($c, CURLOPT_URL, str_replace(array('$DATE', '$STATION'), array($date, $num), $address));
+		curl_setopt($c, CURLOPT_CONNECTTIMEOUT, 30);
+		curl_setopt($c, CURLOPT_FOLLOWLOCATION, TRUE);
+		curl_setopt($c, CURLOPT_MAXREDIRS, 5);
+		curl_setopt($c, CURLOPT_HTTPHEADER, array('User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.2; pl-PL; rv:1.9.2) Gecko/20100101 Firefox/3.6'));
+		curl_setopt($c, CURLOPT_RETURNTRANSFER, TRUE);
+		$data = curl_exec($c);
+		if(!$data) {
+			echo FAIL;
+			return;
 		}
 		
-		$doc = new DOMDocument;
-		$doc->loadHTMLFile('./cache/'.$num.'_'.$date);
-		$doc = $doc->getElementById('bxNazwaBoksu')->childNodes;
-		
-		foreach($doc as $el) {
-			if($el instanceof DOMElement) {
-				$doc = $el->childNodes;
-				break;
-			}
-		}
-		
-		$last_time = 0;
-		$last_timestamp = 0;
-		foreach($doc as $el) {
-			if(!$el instanceof DOMElement || substr($el->getAttribute('class'), 0, 7)!='program') continue;
-			
-			$time = $el->getElementsByTagName('strong')->item(0)->childNodes->item(0)->nodeValue;
-			$time = trim($time);
-			if($last_time>(int)$time) {
-				$timestamp = strtotime('+1 day', $timestamp);
-			}
-			$last_time = (int)$time;
-			$timestamp = strtotime($time, $timestamp);
-			
-			if($last_timestamp) {
-				fwrite($out, '	<programme channel="'.$station.'" start="'.date('YmdHis O', $last_timestamp).'" stop="'.date('YmdHis O', $timestamp).'">
-		<title>'.$name.'</title>
-		<desc/>
-	</programme>
-');
-			}
-			
-			$name = $el->getElementsByTagName('h4')->item(0)->childNodes->item(0)->childNodes->item(0)->nodeValue;
-			$name = htmlspecialchars(trim($name), ENT_COMPAT, 'UTF-8');
-			$last_timestamp = $timestamp;
-		}
-		
-		fwrite($out, '	<programme channel="'.$station.'" start="'.date('YmdHis O', $timestamp).'" stop="'.date('YmdHis O', $timestamp+3600).'">
-		<title>'.$name.'</title>
-		<desc/>
-	</programme>
-');
-		
-		unset($doc);
+		file_put_contents('./cache/'.$num.'_'.$date, $data);
+		unset($data);
 	}
 	
+	$doc = new DOMDocument;
+	@$doc->loadHTMLFile('./cache/'.$num.'_'.$date);
+	
+	$wp = new wp_parse($doc);
+	$wp->xmltv($station, $out);
+	
 	$counter++;
 }
 
diff --git a/data/tv/wp_parse.php b/data/tv/wp_parse.php
new file mode 100644
index 0000000..f3cb7c7
--- /dev/null
+++ b/data/tv/wp_parse.php
@@ -0,0 +1,160 @@
+<?php
+class wp_parse {
+	/** Parsed page, the XPath evaluator bound to it, and the schedule container node. */
+	var $document, $xpath, $context;
+	/** Station name read from the schedule heading. */
+	var $name = '';
+	/** Not referenced inside this class; kept so the public property list is unchanged. */
+	var $program = array();
+	/** Polish month names (genitive) => month number. */
+	var $months = array(
+			'stycznia' => 1, 'lutego' => 2, 'marca' => 3, 'kwietnia' => 4,
+			'maja' => 5, 'czerwca' => 6, 'lipca' => 7, 'sierpnia' => 8,
+			'września' => 9, 'października' => 10, 'listopada' => 11, 'grudnia' => 12,
+		);
+	/** Polish weekday names => strtotime() expression for the next such day. */
+	var $weekdays = array(
+			'poniedziałek' => 'next Monday', 'wtorek' => 'next Tuesday',
+			'środa' => 'next Wednesday', 'czwartek' => 'next Thursday',
+			'piątek' => 'next Friday', 'sobota' => 'next Saturday',
+			'niedziela' => 'next Sunday',
+		);
+
+	/**
+	 * Locates the schedule container and the station name in a parsed page.
+	 *
+	 * @param DOMDocument $document parsed HTML of a station's programme page
+	 * @throws Exception when the page does not hold exactly one
+	 *                   div.ramowka or exactly one station-name text node
+	 */
+	function __construct(DOMDocument $document) {
+		$this->document = $document;
+		$this->xpath = new DOMXPath($this->document);
+
+		$context = $this->xpath->query('//div[@class="ramowka"]');
+		if($context->length != 1) {
+			throw new Exception('Nie znaleziono ramówki!');
+		}
+		$this->context = $context->item(0);
+
+		$name = $this->xpath->query('.//h2[@class="sh2"]//span//text()', $this->context);
+		if($name->length != 1) {
+			throw new Exception('Nie znaleziono nazwy stacji, błędny HTML.');
+		}
+		$this->name = $name->item(0)->nodeValue;
+	}
+
+	/**
+	 * Converts a day label from the schedule header into a timestamp for that day.
+	 *
+	 * Accepted forms: 'dzisiaj' (today), a bare weekday name such as
+	 * 'poniedziałek' (the next such day), or a past date such as
+	 * 'pon. 18 czerwca' (abbreviated weekday, day of month, month name).
+	 *
+	 * @param string $date day label; surrounding whitespace is ignored
+	 * @return int Unix timestamp for that day
+	 * @throws Exception when the label matches none of the accepted forms
+	 */
+	function parse_date($date) {
+		$date = trim($date);
+		if($date === 'dzisiaj') {
+			return mktime(0, 0, 0);
+		}
+		if(isset($this->weekdays[$date])) {
+			return strtotime($this->weekdays[$date]);
+		}
+
+		$parts = preg_split('/\s+/', $date);
+		if(count($parts) < 3 || !isset($this->months[$parts[2]])) {
+			throw new Exception('Nie udało się przetworzyć daty ('.$date.')');
+		}
+		$timestamp = mktime(0, 0, 0, $this->months[$parts[2]], (int)$parts[1]);
+
+		// The label carries no year, so a result in the future belongs to last year.
+		if($timestamp > time()) {
+			$timestamp = strtotime('-1 year', $timestamp);
+		}
+		return $timestamp;
+	}
+
+	/**
+	 * Returns the text of the first node matching $query below $node,
+	 * or an empty string when nothing matches.
+	 */
+	private function first_text($query, $node) {
+		$found = $this->xpath->query($query, $node)->item(0);
+		return $found === NULL ? '' : $found->textContent;
+	}
+
+	/**
+	 * Writes the station's <channel> element followed by its <programme>
+	 * elements to $fp.
+	 *
+	 * A programme's stop time is the start of the one after it, so the last
+	 * programme found on the page is never written.
+	 *
+	 * @param string   $id channel identifier for the id/channel attributes
+	 * @param resource $fp writable stream receiving the XML fragments
+	 * @throws Exception when a day label cannot be parsed
+	 */
+	function xmltv($id, $fp) {
+		$days = array();
+		$program = array();
+		foreach($this->xpath->query('.//ul[@class="lsDay"]//li', $this->context) as $day) {
+			$days[] = $this->parse_date($day->nodeValue);
+			$program[] = array();
+		}
+
+		// Rows are full hours; each row holds one column per day.
+		$hours_dom = $this->xpath->query('.//div[@class="hrsOut"]//div[@class="hour"]', $this->context);
+		foreach($hours_dom as $hour) {
+			foreach($this->xpath->query('.//div[@class="col"]', $hour) as $num => $day) {
+				foreach($this->xpath->query('.//div[@class="prog"]', $day) as $prog) {
+					$program[$num][] = array(
+						$this->first_text('.//div[@class="tm"]', $prog),
+						$this->first_text('.//h3', $prog),
+						$this->first_text('.//p', $prog),
+					);
+				}
+			}
+		}
+
+		$id = htmlspecialchars($id, ENT_COMPAT, 'UTF-8');
+		fwrite($fp, "\t".'<channel id="'.$id.'">'."\n"
+			."\t\t".'<display-name>'.htmlspecialchars($this->name, ENT_COMPAT, 'UTF-8').'</display-name>'."\n"
+			."\t".'</channel>'."\n");
+
+		// Fall back to today when the page listed no day labels.
+		$last_timestamp = isset($days[0]) ? $days[0] : mktime(0, 0, 0);
+		$last_prog = NULL;
+		foreach($program as $day => $dayprog) {
+			foreach($dayprog as $prog) {
+				$timestamp = strtotime(trim($prog[0]), $last_timestamp);
+				// Entries without a usable start time cannot be placed.
+				if($timestamp === FALSE) continue;
+				// An earlier clock time than the previous entry means midnight passed.
+				if($timestamp < $last_timestamp) {
+					$timestamp = strtotime('+1 day', $timestamp);
+				}
+				// Keep the entry on or after the start of its own day column.
+				while(isset($days[$day]) && $timestamp < $days[$day]) {
+					$timestamp = strtotime('+1 day', $timestamp);
+				}
+
+				// The previous programme ends where this one starts; nothing is
+				// written until a first programme has been seen.
+				if($last_prog !== NULL) {
+					fwrite($fp, "\t".'<programme channel="'.$id.'" start="'.date('YmdHis O', $last_timestamp).'"'
+						.' stop="'.date('YmdHis O', $timestamp).'">'."\n"
+						."\t\t".'<title>'.htmlspecialchars($last_prog[1], ENT_COMPAT, 'UTF-8').'</title>'."\n"
+						."\t\t".'<desc>'.htmlspecialchars($last_prog[2], ENT_COMPAT, 'UTF-8').'</desc>'."\n"
+						."\t".'</programme>'."\n");
+				}
+
+				$last_prog = $prog;
+				$last_timestamp = $timestamp;
+			}
+		}
+	}
+}
+?>

--
Gitblit v1.9.1