364 lines
17 KiB
PHP
364 lines
17 KiB
PHP
#!/usr/bin/php -c/var/www/batch/config/php_batch_sd.ini
|
||
<?php
|
||
|
||
include_once(FWK_PATH.'common/chiffres.php');
|
||
include_once(FWK_PATH.'common/dates.php');
|
||
include_once(FWK_PATH.'common/ftp.php');
|
||
include_once(INCLUDE_PATH.'insee/classMInsee.php');
|
||
include_once(INCLUDE_PATH.'partenaires/classMBilans.php');
|
||
include_once(FWK_PATH.'mail/sendMail.php');
|
||
|
||
// Timestamp (YmdHis) captured once at start-up; reused as 'dateInsert' for
// every row inserted during this run.
$dateTime = date('YmdHis');

// Usage text shown for the "-?" and "--..." options.
$strInfoScript = 'Usage : '.basename($argv[0]). " [OPTION]
Récuperer les logos non encore en base.

Options :
-b Rechercher les logo via Bing
-v Mode bavard ou debug

(*) Option par défaut si aucun argument n'est passé.
";

// Option flags and their defaults.
$bing = false;      // -b : also run the Bing image search at the end of the script
$modeDebug = false; // -v : verbose / debug output
$iReprise = 0;

// Walk the command line. Arguments that do not start with '-' are ignored;
// only the first character after the dash is significant (case-insensitive).
$argIndex = 1;
while (isset($argv[$argIndex])) {
    $arg = $argv[$argIndex];
    $argIndex++;

    if (substr($arg, 0, 1) != '-') {
        continue;
    }

    $flag = strtolower(substr($arg, 1, 1));
    if ($flag == 'b') {
        $bing = true;
    } elseif ($flag == 'v') {
        $modeDebug = true;
    } elseif ($flag == '-' || $flag == '?') {
        // "--anything" and "-?" both print the usage text and stop.
        die($strInfoScript);
    } else {
        die('Option '. $arg . ' inconnue !'.EOL);
    }
}
|
||
|
||
if ($modeDebug) {
    echo date('Y/m/d - H:i:s') ." - DEBUT du programme de récupération des Logos...".EOL;
}

$iDb = new WDB('telephonie');

// Fetch the "latest registered companies" listing page.
$referer = '';
$url = "http://www.annuaire.com/dernieres-societes-inscrites/";
$page = getUrl($url, '', '', $referer, false, '', '', 60);
$body = $page['body'];

/* Shape of one listing entry, as targeted by the pattern below:
 *
 *   <li class="even">
 *     <a href="http://www.annuaire.com/<category>/<slug>-<siren>/"><img src="http://www.annuaire.com/..." /></a>
 *     <strong><a href="...">COMPANY NAME</a></strong><br />
 *     <p>short description...</p>
 *   </li>
 *
 * Companies with a real logo have an image under
 * http://www.annuaire.com/uploads/<3 digits>/<3 digits>/<siren>/logo.<ext>;
 * the others point at a generic category pictogram.
 *
 * Capture groups: 1 = company page URL, 2 = image URL, 3 = image path,
 * 4 = company name, 5 = remaining HTML of the entry (description, city...).
 */
if (!preg_match_all('/<li(?:.*)>(?:\s+)<a href="(http\:\/\/www\.annuaire\.com\/(?:.*)\/)"><img src="(http\:\/\/www\.annuaire\.com\/(.*))" \/><\/a>(?:\s+)<strong><a href=".*">(.*)<\/a><\/strong><br \/>(.*)<\/li>/Uis', $body, $matches))
    die('Erreur de parsing des logos...');

/** First pass: download every logo up front, the way a browser would. **/
foreach ($matches[1] as $i => $urlFiche) {
    // The company page URL ends with "-<siren>/": keep the last 10 characters
    // and drop the slash.
    $siren = str_replace('/', '', substr($urlFiche, -10));

    // The SIREN comes from scraped HTML and ends up in a file path: accept
    // nothing but exactly nine digits.
    if (!preg_match('/^\d{9}$/D', $siren)) {
        continue;
    }

    $urlLogo = $matches[2][$i];

    // Only images under /uploads/ are company logos.
    if (substr($urlLogo, 0, 32) != 'http://www.annuaire.com/uploads/') {
        continue;
    }

    $extension = substr(strrchr($urlLogo, '.'), 1);

    // Same concern as for the SIREN: the extension becomes part of a path.
    if (!preg_match('/^[A-Za-z0-9]{1,5}$/D', $extension)) {
        continue;
    }

    $logoPath = "/home/data/logos/$siren.$extension";
    if (file_exists($logoPath)) {
        continue; // already downloaded during a previous run
    }

    $referer = '';
    $page = getUrl($urlLogo, '', '', $referer, false, '', '', 60);
    $logo = $page['body'];

    // Never create an empty file: it would be mistaken for a valid logo and
    // would prevent any later retry (the file_exists() test above).
    if (!is_string($logo) || $logo === '') {
        if ($modeDebug) {
            echo date('Y/m/d - H:i:s') ." - Logo vide ou non téléchargé : $urlLogo".EOL;
        }
        continue;
    }

    file_put_contents($logoPath, $logo);
}
|
||
|
||
/**
 * Second pass over the listing: for every company not yet present in
 * 'societe_ent', read its detail page, store its phone numbers in
 * 'societe_tel' and insert the company row itself.
 */
foreach ($matches[1] as $i => $urlFiche) {
    $siren = str_replace('/', '', substr($urlFiche, -10));

    // The SIREN is scraped from HTML and interpolated into the WHERE clause
    // below: accept nothing but exactly nine digits.
    if (!preg_match('/^\d{9}$/D', $siren)) {
        continue;
    }

    $ret = $iDb->select('societe_ent', 'count(*) AS nb', "siren=$siren");
    $nbDeja = $ret[0][0];
    if ($nbDeja != 0) {
        continue; // company already known
    }

    // Logo flag: 1 when the logo file exists on disk. The extension must be
    // derived from THIS entry's image URL; relying on whatever value an
    // earlier loop left in $extension checks the wrong file name.
    $urlLogo = $matches[2][$i];
    $aLogo = 0;
    if (substr($urlLogo, 0, 32) == 'http://www.annuaire.com/uploads/') {
        $extension = substr(strrchr($urlLogo, '.'), 1);
        if (file_exists("/home/data/logos/$siren.$extension")) {
            $aLogo = 1;
        }
    }

    $tabEntrep = array('siren' => $siren, 'logo' => $aLogo);
    $tabEntrep['nom'] = $nom = $matches[4][$i];
    // Short description taken from the listing entry; kept in $desc so the
    // log line at the end of the iteration actually shows it.
    $tabEntrep['descCourt'] = $desc = trim(html_entity_decode(strip_tags($matches[5][$i])));

    $categ = $web = '';

    // NOTE(review): the detail page is not read for the first match ($i == 0).
    // The reason is not visible in this file — confirm it is intentional.
    if ($i > 0) {
        /** Read the company detail page **/
        $referer = '';
        $page = getUrl($urlFiche, '', '', $referer, false, '', '', 60);
        $body = $page['body'];

        // Phone numbers. Groups: 1 = type (title attribute), 2 = label,
        // 3 = number, 4 = trailing info.
        if (preg_match_all('/<li class="tel">(?:\s+)<label class="type" title="(.*)"(?:\s+)?>(.*)<\/label>(?:\s+)<span class="dot">\:<\/span>(?:\s+)<span class="value"(?:.*)?>(.*)<\/span>(.*)<\/li>/Uis', $body, $matches2)) {
            foreach ($matches2[1] as $j => $typeTel) {
                $libTel = html_entity_decode($matches2[2][$j]);
                $numTel = trim($matches2[3][$j]);
                // Trailing info is stored without its parentheses.
                $infoTel = html_entity_decode(trim(strtr(strip_tags($matches2[4][$j]), array('(' => '', ')' => ''))));

                $tabInsert = array(
                    'siren'      => $siren,
                    'typeTel'    => $typeTel,
                    'libTel'     => $libTel,
                    'numTel'     => $numTel,
                    'infoTel'    => $infoTel,
                    'dateInsert' => $dateTime,
                );
                if ($iDb->insert('societe_tel', $tabInsert))
                    echo "$dateTime\t$siren\t$nom\t$typeTel\t$libTel\t$numTel\t$infoTel".EOL;
            }
        }

        // Long description.
        if (preg_match('/<div id="description">(?:\s+)<label class="description">(?:.*)<\/span>(?:\s+)<p>(.*)<\/p>(?:\s+)<\/div>/Uis', $body, $matches2))
            $tabEntrep['descLong'] = trim(html_entity_decode(strip_tags($matches2[1])));

        // Category: CSS class of the paragraph and its label.
        if (preg_match('/<div id="category">(?:\s+)<label>(?:.*)<\/span>(?:\s+)<p class="(.*)">(.*)<\/p>/Uis', $body, $matches2)) {
            $tabEntrep['catTyp'] = trim($matches2[1]);
            $tabEntrep['catLib'] = $categ = trim($matches2[2]);
        }

        // Web site.
        if (preg_match('/<div id="site">(?:\s+)<label>(?:.*)<\/span>(?:\s+)<a class="url" target="_blank" href="(.*)">(?:.*)<\/div>/Uis', $body, $matches2))
            $tabEntrep['web'] = $web = trim($matches2[1]);

        // E-mail extraction is not implemented (an earlier attempt targeted a
        // <div id="mail"> block whose address is served as an image).
    }

    $tabEntrep['dateInsert'] = $dateTime;
    if ($iDb->insert('societe_ent', $tabEntrep))
        echo "$dateTime\t$siren\t$nom\t$urlLogo\t$desc\t$categ\t$web".EOL;
}
|
||
//print_r($matches);
|
||
if ($modeDebug) {
    echo date('Y/m/d - H:i:s') ." - FIN du programme de récupération des Logos.".EOL;
}

// Everything below only runs when the -b option was given.
if (!$bing) die();

// Bing image search: look for annuaire.com logos by querying every
// 3-digit prefix "000".."999" followed by "logo site:annuaire.com".
// NOTE(review): the API key is hard-coded in the source; consider moving it
// to configuration.
$appId = '56D6CBA671C986D3EA11B1B48F97507BC5A00D51';
$numResults = 50;
$cultureInfo = 'fr-FR';

for ($i = 0; $i < 1000; $i++) {
    $sir1 = sprintf("%03s", $i);
    $rs = "$sir1 logo site:annuaire.com";
    $query = stripslashes(urlencode($rs));

    // Only the keys are sent to the API (joined with '+'); the value is a
    // description kept for reference. Other source types (web, news, video...)
    // are not requested.
    $tabSources = array(
        'image'=>'Full-size image and thumbnail image information, including the file size in bytes (if available), height and width in pixels (if available), and the URI to the full-size image or thumbnail',
    );
    $source = implode('+', array_keys($tabSources));

    $offset = 0;
    while (true) {
        $url="http://api.bing.net/json.aspx?AppId=$appId&Version=2.2&Market=$cultureInfo&Query=$query&Sources=$source&Image.Count=$numResults&Image.Offset=$offset&JsonType=raw";//&Adult=On";
        $page = getUrl($url, '', '', $referer, false);

        // Debug stop left in place on purpose: the raw response of the first
        // request is dumped and the script ends here. Everything below in
        // this loop is currently unreachable.
        // NOTE(review): remove these two lines to re-enable the Bing crawl.
        print_r($page);
        die();

        $tabJson = json_decode($page['body'], true);
        $tabJson = (is_array($tabJson) && isset($tabJson['SearchResponse']) && is_array($tabJson['SearchResponse']))
            ? $tabJson['SearchResponse']
            : array();

        // Guard against a missing or malformed result list instead of
        // iterating over null.
        $resultats = (isset($tabJson['Image']['Results']) && is_array($tabJson['Image']['Results']))
            ? $tabJson['Image']['Results']
            : array();

        foreach ($resultats as $j => $tab) {
            if (!isset($tab['MediaUrl'])) {
                continue;
            }
            $urlLogo = $tab['MediaUrl'];

            // Check the prefix BEFORE splitting the path: for any other URL the
            // third path segment is meaningless or missing.
            if (substr($urlLogo, 0, 32) != 'http://www.annuaire.com/uploads/') {
                continue;
            }

            // Path after the prefix is "<3 digits>/<3 digits>/<siren>/logo.<ext>".
            $tmp = explode('/', substr($urlLogo, 32));
            if (!isset($tmp[2]) || !preg_match('/^\d{9}$/D', $tmp[2])) {
                continue;
            }
            $siren = $tmp[2];

            $extension = substr(strrchr($urlLogo, '.'), 1);
            if (!preg_match('/^[A-Za-z0-9]{1,5}$/D', $extension)) {
                continue;
            }

            if (!file_exists("/home/data/logos/$siren.$extension")) {
                $referer = '';
                $page = getUrl($urlLogo, '', '', $referer, false, '', '', 60);
                $body = $page['body'];
                file_put_contents("/home/data/logos/$siren.$extension", $body);
                echo "$sir1\t$offset\t$siren\t$urlLogo".EOL;
                randsleep(1,2);
            }
        }

        // Next page; stop once the offset passes the reported total (or when
        // no total was reported at all).
        $offset += $numResults;
        if (!isset($tabJson['Image']['Total']) || $offset > $tabJson['Image']['Total']) break;
    }
}

die();
|
||
|
||
/**
 * Look up the web site of a company through a Bing web search.
 *
 * The query is the SIREN (plain and split in groups of three digits),
 * optionally OR-ed with the company name, with a list of business-data
 * sites excluded. Each result URL is compared with the guess
 * "http://www.<name>.fr/" using both levenshtein() and similar_text();
 * as soon as both measures agree on the same URL within the thresholds,
 * that URL is returned.
 *
 * Side effects: echoes one line per result, and for ".fr" hosts may insert a
 * row into the 'sitesWeb' table.
 *
 * NOTE(review): this is a plain function but it uses $this->iDb and
 * $this->getInfosAfnic(); it looks like it was copied from a class method.
 * Reaching the ".fr" branch outside an object context raises an Error. The
 * owning class is not visible in this file, so the calls are left as-is.
 * NOTE(review): the API key is hard-coded in the source.
 *
 * @param string $siren      SIREN of the company
 * @param string $nomEntrep  Company name (optional)
 * @return string|false      The matching URL, or false when none qualifies
 */
function findSiteWeb($siren, $nomEntrep='') {
    // Bing search settings
    $appId = '56D6CBA671C986D3EA11B1B48F97507BC5A00D51';
    $numResults = 50;
    $cultureInfo = 'fr-FR';

    // "123456789" -> "123 456 789"
    $siren2 = implode(' ', str_split($siren, 3));

    // Always define the optional name clause (it was left undefined when no
    // name was given).
    $rs2 = ($nomEntrep <> '') ? "OR \"$nomEntrep\"" : '';

    $tabSitesExclus = array('societe.com','bilans.net','gouv.fr','info-financiere.fr','bodacc.fr','manageo.fr','bilansgratuits.fr','lesechos.fr','google.fr');
    $rs = "$siren OR \"$siren2\" $rs2 -site:".implode(' -site:', $tabSitesExclus);
    $query = stripslashes(urlencode($rs));

    // Only web page results are requested.
    $tabSources = array('web'=>'Web page results');
    $source = implode('+', array_keys($tabSources));

    $url = "http://api.bing.net/json.aspx?AppId=$appId&Version=2.2&Market=$cultureInfo&Query=$query&Sources=$source&Web.Count=$numResults&JsonType=raw";
    $referer = ''; // was undefined in this scope
    $page = getUrl($url, '', '', $referer, false);

    /* Each entry of SearchResponse.Web.Results looks like:
     *   [Title]       => SCORES & DECISIONS - Accueil
     *   [Description] => Scores et Décisions - Le nouvel acteur de l'information...
     *   [Url]         => http://www3.scores-decisions.com/
     *   [CacheUrl], [DisplayUrl], [DateTime], and optionally
     *   [DeepLinks]   => list of arrays with [Title] and [Url]
     */
    $tabJson = json_decode($page['body'], true);
    if (!is_array($tabJson)
        || !isset($tabJson['SearchResponse']['Web']['Results'])
        || !is_array($tabJson['SearchResponse']['Web']['Results'])) {
        return false; // request failed or no web results
    }
    $tabJson = $tabJson['SearchResponse'];

    $levMin = 100;  // smallest Levenshtein distance seen so far
    $pctMin = 0;    // highest similar_text() percentage seen so far
    $urlLev = $urlPct = '';
    $urlapprox = "http://www.$nomEntrep.fr/";

    foreach ($tabJson['Web']['Results'] as $i => $result) {
        $url = $result['Url'];

        // levenshtein() rejects arguments longer than 255 characters on older
        // PHP versions (warning + -1), hence the '@' and the "$lev > 0" test.
        $lev = @levenshtein($urlapprox, $url);
        if ($lev > 0 && $lev < $levMin) {
            $levMin = $lev;
            $urlLev = $url;
        }

        // Only the percentage (third argument, by reference) is used.
        similar_text($urlapprox, $url, $pct);
        if ($pct > $pctMin && strpos($url, 'zonebourse') === false) {
            $pctMin = $pct;
            $urlPct = $url;
        }

        // Split the host into domain and extension.
        $info = parse_url($url);
        $host = (is_array($info) && isset($info['host'])) ? preg_replace('/\/$/', '', $info['host']) : '';
        $ext = getFileExtension($host);
        $domaine = getFileExtension(preg_replace("/\.$ext$/", '', '.'.$host));
        echo "RECHERCHE DE '$nomEntrep' ($siren) : Trouvé $domaine.$ext".EOL;

        if ($ext == 'fr') {
            // The URL comes from an external service: quote and escape it.
            // NOTE(review): prefer the escaping helper of the DB wrapper if it
            // has one; addslashes() is only a fallback.
            $rep = $this->iDb->select('sitesWeb', 'siren, web', "web='".addslashes($url)."'");
            $sirenConnu = isset($rep[0]['siren']) ? (int)$rep[0]['siren'] : 0;
            if ($sirenConnu == 0) {
                // Kept in its own variable so the $siren parameter (used in the
                // messages above) is not overwritten for the next results.
                $tabAfnic = $this->getInfosAfnic("$domaine.$ext");
                $sirenAfnic = isset($tabAfnic['siren']) ? (int)$tabAfnic['siren'] : 0;
                if ($sirenAfnic > 0) {
                    $tabInsert = array(
                        'siren'      => $sirenAfnic,
                        'web'        => $url,
                        'dateInsert' => date('YmdHis'),
                    );
                    $this->iDb->insert('sitesWeb', $tabInsert);
                }
            }
        }

        // Accept as soon as both measures point at the same URL within the
        // thresholds.
        if ($levMin < 15 && $pctMin > 44 && $urlLev == $urlPct) {
            echo date('Y-m-d H:i:s') .' - '. $page['code'] . " - $rs - $i - $lev (Min=$levMin) - $pct (Min=$pctMin) - $urlLev - $urlPct - $url !!! RETURNED !!!".EOL;
            return $urlLev;
        }
    }

    return false;
}
|
||
|
||
|
||
?>
|