batch/1.1/old/getLogos.php

364 lines
17 KiB
PHP
Raw Normal View History

#!/usr/bin/php -c/var/www/batch/config/php_batch_sd.ini
<?php
include_once(FWK_PATH.'common/chiffres.php');
include_once(FWK_PATH.'common/dates.php');
include_once(FWK_PATH.'common/ftp.php');
include_once(INCLUDE_PATH.'insee/classMInsee.php');
include_once(INCLUDE_PATH.'partenaires/classMBilans.php');
include_once(FWK_PATH.'mail/sendMail.php');
// Timestamp (YmdHis) computed once so that every row inserted during this run carries the same date.
$dateTime=date('YmdHis');

// Usage text shown for -? and for any option starting with "--".
// The stray blame timestamps that had leaked into this string were removed and the
// accented characters (lost to an encoding mix-up) were restored.
$strInfoScript='Usage : '.basename($argv[0]). " [OPTION]
Récuperer les logos non encore en base.
Options :
-b Rechercher les logo via Bing
-v Mode bavard ou debug
(*) Option par défaut si aucun argument n'est passé.
";

// Option flags: -b enables the Bing image pass, -v enables the start/end debug banners.
$bing=$modeDebug=false;
$iReprise=0; // NOTE(review): never read in the visible part of the script - kept in case other code relies on it.

// Only the first character after the dash is looked at, case-insensitively.
// Arguments that do not start with '-' are silently ignored.
for ($i=1; isset($argv[$i]); $i++) {
    if (substr($argv[$i],0,1)=='-') {
        switch (strtolower(substr($argv[$i],1,1))) {
            case 'b': $bing=true; break;
            case 'v': $modeDebug=true; break;
            case '-':
            case '?': die($strInfoScript);
            default: die('Option '. $argv[$i] . ' inconnue !'.EOL);
        }
    }
}
// Start banner, printed in verbose mode only.
// (A stray blame timestamp between the `if` and its `echo` made this a parse error; removed.)
if ($modeDebug)
    echo date('Y/m/d - H:i:s') ." - DEBUT du programme de récupération des Logos...".EOL;

// Database handle used by the insert pass further down.
$iDb=new WDB('telephonie');

// Fetch the "latest registered companies" listing page.
$referer='';
$url="http://www.annuaire.com/dernieres-societes-inscrites/";
$tDeb=microtime(true);
$page=getUrl($url, '', '', $referer, false, '', '', 60);
$duree=round(microtime(true)-$tDeb,3);   // elapsed seconds for the request
$body=$page['body'];
$taille=round(strlen($body)/1024,1);     // body size in KiB
/* <li class="even">
<a href="http://www.annuaire.com/marchand-de-meubles/love-lit-kids-love-lit-kids-528117443/"><img src="http://www.annuaire.com/images/category/picto/13.gif" /></a>
<strong><a href="http://www.annuaire.com/marchand-de-meubles/love-lit-kids-love-lit-kids-528117443/">LOVE LIT KIDS (LOVE LIT KIDS)</a></strong><br />
<p>mobiliers et d&eacute;corations contemporain et vintage pour chambres d'enfants de...</p>
</li>
</ul>
</div>
</div>
<div id="right-space">
<div class="ad"><script charset="iso-8859-1" type="text/javascript" src="http://adnext.fr/richmedia.adv?popunderid=97318&amp;tag=2&amp;s=big&amp;section=last_register_company"></script></div>
<div class="side-interpanel-separator"></div>
<div class="side-panel">
<p class="side-panel-title"><img src="http://www.annuaire.com/images/annuaire-rss.png" alt="" /> Soci&eacute;t&eacute;s <span class="annuaire_blue">premium</span></p>
<ul class="side-company-list">
<li>
<a href="http://www.annuaire.com/agence-de-publicite/west-development-513135723/"><img src="http://www.annuaire.com/uploads/513/135/513135723/logo.jpg" /></a>
<strong><a href="http://www.annuaire.com/agence-de-publicite/west-development-513135723/">WEST DEVELOPMENT</a></strong><br />
<span class="city annuaire_blue">Vire</span><br />
<p>WEST DEVELOPMENT le sp&eacute;cialiste en sonorisation, &eacute;clairage et instrument de...</p>
</li>
<li>*/
// Extract every company entry of the listing page. Capture groups:
//   1 = company page URL, 2 = image URL (logo or category pictogram), 3 = image path,
//   4 = company name, 5 = remaining HTML of the list item.
if (!preg_match_all('/<li(?:.*)>(?:\s+)<a href="(http\:\/\/www\.annuaire\.com\/(?:.*)\/)"><img src="(http\:\/\/www\.annuaire\.com\/(.*))" \/><\/a>(?:\s+)<strong><a href=".*">(.*)<\/a><\/strong><br \/>(.*)<\/li>/Uis', $body, $matches))
    die('Erreur de parsing des logos...');

/** First pass: download all the logos up front, the way a browser would **/
foreach ($matches[1] as $i=>$urlFiche) {
    // The company page URL ends with "<siren>/": keep its last 10 characters minus the slash.
    $siren=str_replace('/','',substr($urlFiche,-10));
    $urlLogo=$matches[2][$i];

    // Only images under /uploads/ are fetched; other image URLs are skipped.
    if (substr($urlLogo,0,32)=='http://www.annuaire.com/uploads/') {
        $extension=substr(strrchr($urlLogo,'.'),1);
        $fichierLogo="/home/data/logos/$siren.$extension";
        if (!file_exists($fichierLogo)) {
            $referer='';
            $tDeb=microtime(true);
            $page=getUrl($urlLogo, '', '', $referer, false, '', '', 60);
            $duree=round(microtime(true)-$tDeb,3);
            $body=$page['body'];
            $taille=round(strlen($body)/1024,1);
            // Do not store an empty file when the download failed: it would make
            // file_exists() succeed on the next run and the logo would never be retried.
            if ($body!='')
                file_put_contents($fichierLogo,$body);
        }
    }
}
/**
 * Second pass: for every listed company that is not yet in `societe_ent`,
 * read its detail page, store its phone numbers in `societe_tel` and insert
 * the company row. Logos are downloaded by the earlier pass; this loop only
 * checks whether the logo file is present on disk.
 */
foreach ($matches[1] as $i=>$urlFiche) {
    $siren=str_replace('/','',substr($urlFiche,-10));
    // The SIREN comes from scraped HTML and is interpolated into the SQL condition
    // below: refuse anything that is not purely numeric.
    if (!ctype_digit($siren)) continue;

    $ret=$iDb->select('societe_ent', 'count(*) AS nb', "siren=$siren");
    $nbDeja=$ret[0][0];
    if ($nbDeja==0) {
        $tabEntrep=array('siren'=>$siren);
        $urlLogo=$matches[2][$i];
        // The extension must be derived from THIS entry's logo URL. It used to be
        // whatever value the download loop left behind on its last iteration.
        $extension=substr(strrchr($urlLogo,'.'),1);
        if (substr($urlLogo,0,32)=='http://www.annuaire.com/uploads/' &&
            file_exists("/home/data/logos/$siren.$extension"))
            $tabEntrep['logo']=1;
        else
            $tabEntrep['logo']=0;

        $tabEntrep['nom']=$nom=$matches[4][$i];
        $tabEntrep['descCourt']=@trim(html_entity_decode(strip_tags($matches[5][$i])));

        // NOTE(review): $desc is always empty in the log line printed at the end of the
        // iteration (the original code cleared it right after filling it) - confirm
        // whether the short description was meant to be logged.
        $desc=$categ=$web='';

        // NOTE(review): the detail page of the very first entry ($i==0) is never read;
        // the reason is not visible in this script - confirm it is intentional.
        if ($i>0) {
            /** Read the company detail page **/
            $referer='';
            $tDeb=microtime(true);
            $page=getUrl($urlFiche, '', '', $referer, false, '', '', 60);
            $duree=round(microtime(true)-$tDeb,3);
            $body=$page['body'];

            // Phone entries: 1 = type (label title attribute), 2 = label text, 3 = number, 4 = trailing info.
            if (preg_match_all('/<li class="tel">(?:\s+)<label class="type" title="(.*)"(?:\s+)?>(.*)<\/label>(?:\s+)<span class="dot">\:<\/span>(?:\s+)<span class="value"(?:.*)?>(.*)<\/span>(.*)<\/li>/Uis', $body, $matches2)) {
                foreach ($matches2[1] as $j=>$typeTel) {
                    $libTel=html_entity_decode($matches2[2][$j]);
                    $numTel=trim($matches2[3][$j]);
                    // Trailing info: tags and parentheses are stripped.
                    $infoTel=html_entity_decode(trim(strtr(strip_tags($matches2[4][$j]),array('('=>'',')'=>''))));
                    $tabInsert=array('siren'=>$siren,
                        'typeTel'=>$typeTel,
                        'libTel'=>$libTel,
                        'numTel'=>$numTel,
                        'infoTel'=>$infoTel,
                        'dateInsert'=>$dateTime,
                    );
                    if ($iDb->insert('societe_tel', $tabInsert))
                        echo "$dateTime\t$siren\t$nom\t$typeTel\t$libTel\t$numTel\t$infoTel".EOL;
                }
            }

            // Long description.
            if (preg_match('/<div id="description">(?:\s+)<label class="description">(?:.*)<\/span>(?:\s+)<p>(.*)<\/p>(?:\s+)<\/div>/Uis', $body, $matches2))
                $tabEntrep['descLong']=@trim(html_entity_decode(strip_tags($matches2[1])));

            // Category: CSS class of the paragraph and its label.
            if (preg_match('/<div id="category">(?:\s+)<label>(?:.*)<\/span>(?:\s+)<p class="(.*)">(.*)<\/p>/Uis', $body, $matches2)) {
                $tabEntrep['catTyp']=@trim($matches2[1]);
                $tabEntrep['catLib']=$categ=@trim($matches2[2]);
            }

            // Web site URL.
            if (preg_match('/<div id="site">(?:\s+)<label>(?:.*)<\/span>(?:\s+)<a class="url" target="_blank" href="(.*)">(?:.*)<\/div>/Uis', $body, $matches2))
                $tabEntrep['web']=$web=@trim($matches2[1]);
            // The e-mail address is not extracted (a commented-out, unfinished attempt was removed).
        }

        $tabEntrep['dateInsert']=$dateTime;
        if ($iDb->insert('societe_ent', $tabEntrep))
            echo "$dateTime\t$siren\t$nom\t$urlLogo\t$desc\t$categ\t$web".EOL;
    }
}
//print_r($matches);
// End banner for the listing scrape, printed in verbose mode only.
// (A stray blame timestamp between the `if` and its `echo` made this a parse error; removed.)
if ($modeDebug)
    echo date('Y/m/d - H:i:s') ." - FIN du programme de récupération des Logos.".EOL;

// Everything below only runs when -b was given.
if (!$bing) die();

// Bing search
// NOTE(review): the API key is hard-coded in the source - consider moving it to configuration.
$appId = '56D6CBA671C986D3EA11B1B48F97507BC5A00D51';
$numResults = 50;
$cultureInfo = 'fr-FR';

// One image search per 3-digit prefix ("000" .. "999"), restricted to annuaire.com.
for($i=0; $i<1000; $i++) {
    $sir1=sprintf("%03s", $i);
    $rs="$sir1 logo site:annuaire.com";
    $query=stripslashes(urlencode($rs));

    // Only the keys are used (joined into the Sources parameter). The other source
    // types that were listed as commented-out entries are not requested.
    $tabSources=array(
        'image'=>'Full-size image and thumbnail image information, including the file size in bytes (if available), height and width in pixels (if available), and the URI to the full-size image or thumbnail',
    );
    $source=implode('+', array_keys($tabSources));

    $offset=0;
    while (true) {
        $url="http://api.bing.net/json.aspx?AppId=$appId&Version=2.2&Market=$cultureInfo&Query=$query&Sources=$source&Image.Count=$numResults&Image.Offset=$offset&JsonType=raw";//&Adult=On";
        $page=getUrl($url, '', '', $referer, false);

        // NOTE(review): debug leftover kept on purpose - the script dumps the first
        // response and stops here, so the code below is currently unreachable.
        // Removing these two lines would enable up to 1000 paginated remote queries;
        // confirm that this is wanted before doing so.
        print_r($page);
        die();

        $json=$page['body'];
        $tabJson=json_decode($json, true);
        // Stop paging for this prefix when the response has no image results.
        if (!isset($tabJson['SearchResponse']['Image']['Results'])) break;
        $tabJson=$tabJson['SearchResponse'];

        foreach ($tabJson['Image']['Results'] as $j=>$tab) {
            $urlLogo=$tab['MediaUrl'];
            if (substr($urlLogo,0,32)=='http://www.annuaire.com/uploads/') {
                // Expected path after /uploads/ is "xxx/xxx/<siren>/logo.<ext>": the SIREN is the 3rd segment.
                $tmp=explode('/',str_replace('http://www.annuaire.com/uploads/','',$urlLogo));
                if (!isset($tmp[2])) continue;
                $siren=$tmp[2];
                $extension=substr(strrchr($urlLogo,'.'),1);
                if (!file_exists("/home/data/logos/$siren.$extension")) {
                    $referer='';
                    $tDeb=microtime(true);
                    $page=getUrl($urlLogo, '', '', $referer, false, '', '', 60);
                    $duree=round(microtime(true)-$tDeb,3);
                    $body=$page['body'];
                    $taille=round(strlen($body)/1024,1);
                    file_put_contents("/home/data/logos/$siren.$extension",$body);
                    echo "$sir1\t$offset\t$siren\t$urlLogo".EOL;
                    randsleep(1,2);
                }
            }
        }

        // Next page, until the reported total is exceeded (or is missing).
        $offset+=$numResults;
        if (!isset($tabJson['Image']['Total']) || $offset>$tabJson['Image']['Total']) break;
    }
}
die();
/**
 * Try to guess a company's web site from a Bing web search.
 *
 * The search looks for the SIREN (plain and in groups of 3 digits) and, when given,
 * the company name, excluding a list of directory / financial sites. Each result URL
 * is compared with "http://www.<name>.fr/" using levenshtein() and similar_text();
 * the best URL is returned as soon as both measures agree on it and pass the thresholds.
 *
 * Each result item is an array with at least 'Title', 'Description' and 'Url', e.g.
 *   [Title] => SCORES & DECISIONS - Accueil
 *   [Description] => Scores et Décisions - Le nouvel acteur de l'information sur les entreprises ...
 *   [Url] => http://www3.scores-decisions.com/
 *
 * @param string $siren     SIREN of the company being searched.
 * @param string $nomEntrep Optional company name, added to the query and used to build the reference URL.
 * @return string|false     The selected URL, or false when no result qualifies.
 */
function findSiteWeb($siren, $nomEntrep='') {
    // Bing search
    // NOTE(review): the API key is hard-coded in the source - consider moving it to configuration.
    $appId = '56D6CBA671C986D3EA11B1B48F97507BC5A00D51';
    $numResults = 50;
    $cultureInfo = 'fr-FR';
    $referer=''; // was never initialised in this scope

    $siren2=implode(' ', str_split($siren, 3));
    $rs2=''; // was undefined when no company name is given
    if ($nomEntrep<>'') $rs2="OR \"$nomEntrep\"";
    $tabSitesExclus=array('societe.com','bilans.net','gouv.fr','info-financiere.fr','bodacc.fr','manageo.fr','bilansgratuits.fr','lesechos.fr','google.fr');
    $rs="$siren OR \"$siren2\" $rs2 -site:".implode(' -site:', $tabSitesExclus);
    $query=stripslashes(urlencode($rs));

    // Only the keys are used (joined into the Sources parameter).
    $tabSources=array('web'=>'Web page results');
    $source=implode('+', array_keys($tabSources));

    $url="http://api.bing.net/json.aspx?AppId=$appId&Version=2.2&Market=$cultureInfo&Query=$query&Sources=$source&Web.Count=$numResults&JsonType=raw";
    $page=getUrl($url, '', '', $referer, false);
    $tabJson=json_decode($page['body'], true);

    // No usable answer (request failed, invalid JSON, no web results): nothing to choose from.
    if (!isset($tabJson['SearchResponse']['Web']['Results']) || !is_array($tabJson['SearchResponse']['Web']['Results']))
        return false;
    $tabResults=$tabJson['SearchResponse']['Web']['Results'];

    // NOTE(review): the ".fr" branch below was written as a class method ($this->iDb,
    // $this->getInfosAfnic()); in a plain function $this raises a fatal error. It is now
    // wired to the script-level $iDb and to a global getInfosAfnic() when both exist,
    // and skipped otherwise - confirm the intended wiring.
    $iDb=isset($GLOBALS['iDb']) ? $GLOBALS['iDb'] : null;

    $levMin=100;   // smallest edit distance seen so far
    $pctMin=0;     // highest similarity percentage seen so far
    $urlLev=$urlPct='';
    $urlapprox="http://www.$nomEntrep.fr/";

    foreach ($tabResults as $i=>$result) {
        $url=$result['Url'];

        // Closest URL by edit distance (identical strings, distance 0, are ignored).
        $lev=@levenshtein($urlapprox,$url);
        if ($lev>0 && $lev<$levMin) {
            $levMin=$lev;
            $urlLev=$url;
        }

        // Most similar URL by similar_text() percentage; zonebourse URLs are excluded.
        similar_text($urlapprox,$url,$pct);
        if ($pct>$pctMin && strpos($url, 'zonebourse')===false) {
            $pctMin=$pct;
            $urlPct=$url;
        }

        $info=parse_url($url);
        if (isset($info['host'])) {
            $host=preg_replace('/\/$/','',$info['host']);
            $ext=getFileExtension($host);
            $domaine=getFileExtension(preg_replace('/\.'.preg_quote($ext,'/').'$/','','.'.$host));
            echo "RECHERCHE DE '$nomEntrep' ($siren) : Trouvé $domaine.$ext".EOL;

            if ($ext=='fr' && is_object($iDb) && function_exists('getInfosAfnic')) {
                // The URL is a string value: it must be quoted in the condition.
                // NOTE(review): addslashes() is a stop-gap; use the DB layer's own escaping if it has one.
                $rep=$iDb->select('sitesWeb', 'siren, web', "web='".addslashes($url)."'");
                $sirenConnu=isset($rep[0]['siren']) ? intval($rep[0]['siren']) : 0;
                if ($sirenConnu==0) {
                    $tabAfnic=getInfosAfnic("$domaine.$ext");
                    // Kept in its own variable so the $siren argument is not overwritten
                    // for the following results.
                    $sirenAfnic=isset($tabAfnic['siren']) ? intval($tabAfnic['siren']) : 0;
                    if ($sirenAfnic>0) {
                        $tabInsert=array('siren'=>$sirenAfnic,
                            'web'=>$url,
                            'dateInsert'=>date('YmdHis'));
                        $iDb->insert('sitesWeb', $tabInsert);
                    }
                }
            }
        }

        // Both measures agree on the same URL and pass their thresholds: accept it.
        if ($levMin<15 && $pctMin>44 && $urlLev==$urlPct) {
            echo date('Y-m-d H:i:s') .' - '. $page['code'] . " - $rs - $i - $lev (Min=$levMin) - $pct (Min=$pctMin) - $urlLev - $urlPct - $url !!! RETURNED !!!".EOL;
            return $urlLev;
        }
    }
    return false;
}
?>