Web scraper in PHP: inserting all scraped values into a single database row
I have parsed a website through cURL. When I run a foreach loop, only the last value comes out of it when it is inserted into the MySQL database. I understand that when the insertion is placed outside the loop, the database only receives the loop's last value. But when I put the insert query inside the loop, each value ends up in its own row. I need all of the values in one row. My sample code is below; I have removed many fields.
I hope I was able to explain this. Regards, Rishabh :) Example listing page: http://www.dvdempire.com/trending-blu-ray.html?page=1 , and the details page to scrape: http://www.dvdempire.com/1699319/300-rise-of-an-empire-blu-ray-dvd-ultraviolet-blu-ray.html
<?php
set_time_limit(0);

// Walk the first two pages of the trending list and scrape every title
// that is linked from them.
for ($x = 1; $x <= 2; $x++) {
    $useragent = 'googlebot/2.1 (http://www.googlebot.com/bot.html)';
    sleep(1); // pause between listing pages so the site is not hammered
    $target_url = "http://www.dvdempire.com/trending-blu-ray.html?page=$x";

    $xpath = loadXPath(fetchHtml($target_url, $useragent));
    $nodes21 = $xpath->query('//div[@class="container"]//p[@class="title"]//a');

    foreach ($nodes21 as $node21) {
        // The listing uses site-relative hrefs; prefix the host to get a full URL.
        $target_url12 = $node21->getAttribute('href');
        $target_url1 = "http://www.dvdempire.com$target_url12";
        curl($target_url1);
    }
}

/**
 * Scrape a single title page, print what was found and return it as one row.
 *
 * Every field that can match several nodes (cast, producer, director) is
 * collected completely and joined into ONE comma-separated string, so the
 * whole page maps to a single database row instead of only keeping the last
 * value seen by the loop.
 *
 * @param string   $target_url1 Absolute URL of the title page.
 * @param PDO|null $pdo         Optional connection; when given, the row is
 *                              inserted into the `info` table. When omitted
 *                              nothing is written (same as the original
 *                              script, whose insertion code was disabled).
 *
 * @return array Row with the keys synopsis, length, cast, producer, director.
 */
function curl($target_url1, $pdo = null)
{
    $useragent = 'googlebot/2.1 (http://www.googlebot.com/bot.html)';
    $xpath = loadXPath(fetchHtml($target_url1, $useragent));

    $synopsis = nodeValues($xpath->query('//div[@id="generalinformation"]//div[@class="section synopsis"]//p'));
    $length   = nodeValues($xpath->query('//div[@class="subsection"]//time[@itemprop="duration"]'));
    $cast     = nodeValues($xpath->query('//div[@class="section cast"]//li[@itemprop="actor"]//span[@itemprop="name"]'));
    $producer = nodeValues($xpath->query('//div[@class="section cast"]//li[@itemprop="producer"]//span[@itemprop="name"]'));
    $director = nodeValues($xpath->query('//div[@class="section cast"]//li[@itemprop="director"]//span[@itemprop="name"]'));

    echo "--------------- synopsis --------------- ";
    echo implode('', $synopsis);
    echo "<br>";

    echo "--------------- length --------------- ";
    echo implode('', $length);
    echo "<br>";

    // The three name lists are printed with a comma after every entry.
    $nameLists = array(
        "-------------- cast --------------- "     => $cast,
        "-------------- producer --------------- " => $producer,
        "-------------- director --------------- " => $director,
    );
    $printed = 0;
    foreach ($nameLists as $heading => $names) {
        echo $heading;
        foreach ($names as $name) {
            echo $name . ",";
        }
        $printed++;
        echo $printed < count($nameLists) ? "<br>" : "<hr>";
    }

    // One row per page: multi-valued fields become a single joined string.
    $row = array(
        'synopsis' => implode("\n", $synopsis),
        'length'   => implode(',', $length),
        'cast'     => implode(',', $cast),
        'producer' => implode(',', $producer),
        'director' => implode(',', $director),
    );

    if ($pdo instanceof PDO) {
        saveInfoRow($pdo, $row);
    }

    return $row;
}

/**
 * Download a URL and return its body.
 *
 * On any transfer failure (or an empty body) the cURL error is printed and
 * the script terminates, which is what the original inline code did.
 *
 * @param string $url       Address to fetch.
 * @param string $useragent Value sent as the User-Agent header.
 *
 * @return string Response body.
 */
function fetchHtml($url, $useragent)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_FAILONERROR, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // CURLOPT_TIMEOUT is in seconds; the previous value (13300000) was
    // effectively "never", so a stalled server would hang the whole run.
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);

    $html = curl_exec($ch);
    if ($html === false || $html === '') {
        echo "<br />curl error number:" . curl_errno($ch);
        echo "<br />curl error:" . curl_error($ch);
        exit;
    }

    // $ch is local, so the handle is released when this function returns.
    return $html;
}

/**
 * Parse an HTML string and return an XPath evaluator for it.
 *
 * Real-world markup is rarely valid, so libxml warnings are collected
 * internally and discarded instead of being silenced with the @ operator.
 *
 * @param string $html Markup to parse.
 *
 * @return DOMXPath
 */
function loadXPath($html)
{
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    return new DOMXPath($dom);
}

/**
 * Return the text content of every node in a node list, in document order.
 *
 * @param DOMNodeList $nodes Result of a DOMXPath query.
 *
 * @return array List of strings; empty when nothing matched.
 */
function nodeValues(DOMNodeList $nodes)
{
    $values = array();
    foreach ($nodes as $node) {
        $values[] = $node->nodeValue;
    }

    return $values;
}

/**
 * Insert one scraped row into the `info` table.
 *
 * A prepared statement is used because the values are text scraped from a
 * third-party site and must never be concatenated into the SQL.
 *
 * @param PDO   $pdo Open database connection.
 * @param array $row Row as returned by curl().
 *
 * @throws RuntimeException When the statement cannot be prepared or executed
 *                          and PDO is not already in exception mode.
 */
function saveInfoRow(PDO $pdo, array $row)
{
    $stmt = $pdo->prepare(
        'INSERT INTO info (`synopsis`, `length`, `cast`, `producer`, `director`) '
        . 'VALUES (:synopsis, :length, :cast, :producer, :director)'
    );

    $params = array(
        ':synopsis' => $row['synopsis'],
        ':length'   => $row['length'],
        ':cast'     => $row['cast'],
        ':producer' => $row['producer'],
        ':director' => $row['director'],
    );

    if ($stmt === false || !$stmt->execute($params)) {
        throw new RuntimeException('Could not insert scraped row into info');
    }
}
?>
This is a bit rough. You really should be using prepared statements with PDO rather than the mysql_* functions, which are deprecated, but that is a topic for another time.
For now, this is how I would do it in your particular case:
// Collect everything scraped from the current page into ONE row.
// $values stays a list of rows, so it can hold one entry per scraped page.
$values = array();
$row = array(
    'synopsis' => '',
    'length'   => '',
    'cast'     => '',
    'producer' => '',
    'director' => '',
);

foreach ($nodes1 as $node1) {
    echo $a1 = $node1->nodeValue;
    $row['synopsis'] .= $a1;
}
echo "<br>";

echo "--------------- length --------------- ";
foreach ($nodes2 as $node2) {
    echo $a2 = $node2->nodeValue;
    $row['length'] .= $a2;
}
echo "<br>";

// Multi-valued fields are appended (not overwritten), so every name is kept.
echo "-------------- cast --------------- ";
foreach ($nodes7 as $node7) {
    $a7 = $node7->nodeValue;
    echo $a7 = $a7 . ",";
    $row['cast'] .= $a7;
}
echo "<br>";

echo "-------------- producer --------------- ";
foreach ($nodes8 as $node8) {
    $a8 = $node8->nodeValue;
    echo $a8 = $a8 . ",";
    $row['producer'] .= $a8;
}
echo "<br>";

echo "-------------- director --------------- ";
foreach ($nodes9 as $node9) {
    $a9 = $node9->nodeValue;
    echo $a9 = $a9 . ",";
    $row['director'] .= $a9;
}
echo "<hr>";

// Drop the trailing separator that the echo format leaves on each list.
$row['cast']     = rtrim($row['cast'], ',');
$row['producer'] = rtrim($row['producer'], ',');
$row['director'] = rtrim($row['director'], ',');

// Appending the finished row once is what keeps all fields together;
// writing $values[]["key"] inside the loops would start a new row each time.
$values[] = $row;
then this...
$string = "";
$params = array();
$columns = array('synopsis', 'length', 'cast', 'producer', 'director');

// Loop through the rows and build one "(?,?,?,?,?)" group per row. The actual
// values go into $params so scraped text is never concatenated into the SQL.
foreach ($values as $v) {
    foreach ($columns as $column) {
        // A row may lack a column; store an empty string instead of failing.
        $params[] = isset($v[$column]) ? $v[$column] : '';
    }
    $string .= "(?,?,?,?,?),";
}

// rtrim() returns the trimmed string, so the result must be assigned back;
// otherwise the trailing comma stays and the statement is invalid.
$string = rtrim($string, ',');

// With no rows there is nothing to insert; leave $sql null rather than
// producing "... VALUES " with no value groups.
$sql = null;
if ($string !== '') {
    $sql = "INSERT INTO info (synopsis,length,cast,producer,director) VALUES " . $string;
    // To run it with PDO: $stmt = $pdo->prepare($sql); $stmt->execute($params);
}
php
No comments:
Post a Comment