df662e4369
- meilleure gestion des artisans du RM / infos complémentaires (activité, dates et lieu de naissance) - gestion des entreprises en plan de redressement ou de continuation : affichage de la situation juridique "Plan en cours" en bleu sur la fiche d'identité et suppression des "En procol" si révolue ou inappropriée (réimmatriculation PP après clôture) - suppression/duplication des liens annonces associations, collecte et bodacc... - mise en place d'une file d'attente empêchant les verrous sur la base et les erreurs ou lenteurs sur l'extranet - ajout/correction de ratios - correction de commentaires/score
772 lines
30 KiB
PHP
772 lines
30 KiB
PHP
<?
|
||
|
||
class MSitesWeb {
|
||
|
||
/**
 * Whois server to contact for each supported top-level domain.
 * Keys are TLDs without the leading dot; whois() lower-cases the TLD before the lookup.
 */
private $whois_serveurs = array(
    'ac' => 'whois.nic.ac', 'al' => 'whois.ripe.net', 'am' => 'whois.amnic.net', 'as' => 'whois.nic.as',
    'at' => 'whois.ripe.net', 'au' => 'whois.aunic.net', 'az' => 'whois.ripe.net',
    'ba' => 'whois.ripe.net', 'be' => 'whois.ripe.net', 'bg' => 'whois.ripe.net', 'biz' => 'whois.nic.biz',
    'br' => 'whois.registro.br', 'by' => 'whois.ripe.net',
    'ca' => 'whois.cira.ca', 'cc' => 'whois.nic.cc', 'ch' => 'whois.nic.ch', 'ck' => 'whois.ck-nic.org.ck',
    'cn' => 'whois.cnnic.net.cn', 'com' => 'whois.crsnic.net', 'cx' => 'whois.nic.cx', 'cy' => 'whois.ripe.net',
    'cz' => 'whois.nic.cz',
    'de' => 'whois.denic.de', 'dk' => 'whois.dk-hostmaster.dk', 'dz' => 'whois.ripe.net',
    'edu' => 'rs.internic.net', 'ee' => 'whois.ripe.net', 'eg' => 'whois.ripe.net', 'es' => 'whois.ripe.net',
    'eu' => 'whois.eu',
    'fi' => 'whois.ripe.net', 'fj' => 'whois.usp.ac.fj', 'fo' => 'whois.ripe.net', 'fr' => 'whois.nic.fr',
    'gb' => 'whois.ripe.net', 'ge' => 'whois.ripe.net', 'gov' => 'whois.nic.gov', 'gr' => 'whois.ripe.net',
    'gs' => 'whois.adamsnames.tc',
    'hk' => 'whois.hknic.net.hk', 'hm' => 'whois.registry.hm', 'hr' => 'whois.ripe.net', 'hu' => 'whois.ripe.net',
    'id' => 'whois.idnic.net.id', 'ie' => 'whois.domainregistry.ie', 'info' => 'whois.afilias.net',
    'int' => 'whois.isi.edu', 'il' => 'whois.ripe.net', 'is' => 'whois.isnet.is', 'it' => 'whois.nic.it',
    'jp' => 'whois.nic.ad.jp',
    'ke' => 'whois.rg.net', 'kg' => 'whois.domain.kg', 'kr' => 'whois.nic.or.kr', 'kz' => 'whois.domain.kz',
    'li' => 'whois.nic.li', 'lk' => 'whois.nic.lk', 'lt' => 'whois.ripe.net', 'lu' => 'whois.ripe.net',
    'lv' => 'whois.ripe.net',
    'ma' => 'whois.ripe.net', 'md' => 'whois.ripe.net', 'mil' => 'whois.nic.mil', 'mk' => 'whois.ripe.net',
    'mm' => 'whois.nic.mm', 'ms' => 'whois.adamsnames.tc', 'mt' => 'whois.ripe.net', 'mx' => 'whois.nic.mx',
    'net' => 'rs.internic.net', 'nl' => 'whois.domain-registry.nl', 'no' => 'whois.norid.no',
    'nu' => 'whois.nic.nu', 'nz' => 'whois.domainz.net.nz',
    'org' => 'whois.pir.org',
    'pl' => 'whois.ripe.net', 'pk' => 'whois.pknic.net.pk', 'pt' => 'whois.ripe.net',
    'ro' => 'whois.ripe.net', 'ru' => 'whois.ripn.ru',
    'se' => 'whois.nic-se.se', 'sg' => 'whois.nic.net.sg', 'si' => 'whois.ripe.net', 'sh' => 'whois.nic.sh',
    'sk' => 'whois.ripe.net', 'sm' => 'whois.ripe.net', 'st' => 'whois.nic.st', 'su' => 'whois.ripe.net',
    'tc' => 'whois.adamsnames.tc', 'tf' => 'whois.adamsnames.tc', 'tj' => 'whois.nic.tj',
    'th' => 'whois.thnic.net', 'tm' => 'whois.nic.tm', 'tn' => 'whois.ripe.net', 'to' => 'whois.tonic.to',
    'tr' => 'whois.ripe.net', 'tw' => 'whois.twnic.net',
    'ua' => 'whois.ripe.net', 'uk' => 'whois.nic.uk', 'us' => 'whois.isi.edu',
    'va' => 'whois.ripe.net', 'vg' => 'whois.adamsnames.tc',
    'ws' => 'whois.nic.ws',
    'yu' => 'whois.ripe.net',
    'za' => 'whois.frd.ac.za',
);

/** MInsee helper created by the constructor (used to structure postal addresses). */
private $iInsee;

/** WDB handle opened on 'jo' by the constructor (used for the 'sitesWeb' table). */
private $iDb;
|
||
|
||
/**
 * Creates the two collaborators used by the other methods:
 * an MInsee instance and a WDB handle opened on 'jo'.
 */
function __construct() {
    $this->iInsee = new MInsee();
    $this->iDb    = new WDB('jo');
}
|
||
|
||
/**
 * Tells whether a string is a syntactically valid dotted-quad IPv4 address.
 *
 * The former hand-written check rejected any address containing the byte 255,
 * accepted non-numeric parts ("a.b.c.d") on PHP < 8 and could accept strings
 * made of more than four parts. Validation is now delegated to filter_var().
 *
 * @param string $ip Candidate IPv4 address
 * @return bool true when $ip is a valid IPv4 address, false otherwise
 */
function isIpValide($ip) {
    return is_string($ip)
        && filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) !== false;
}
|
||
|
||
/**
 * Gathers technical information about a web site: URL parts, registered
 * domain, DNS/IP status, HTTP response headers and <head> metadata.
 *
 * Steps: validate the URL syntax, split it with parse_url(), derive
 * "domain.ext" from the host (last two labels only, so "foo.co.uk" yields
 * "co.uk"), check the domain in DNS, then fetch "http://<host>" with getUrl().
 *
 * Fixes compared with the previous version:
 *  - the dot between host labels is escaped in the validation pattern, so
 *    strings such as "foo bar" are no longer accepted;
 *  - URLs given without scheme (accepted by the pattern) are parsed as
 *    "http://..." so that parse_url() reports a host; 'url_scheme' is then 'http';
 *  - meta names 'title', 'content-language' and 'languages' no longer fall
 *    through to the wrong key;
 *  - meta contents are converted with utf8_decode() once instead of twice;
 *  - missing response headers no longer raise undefined-index notices.
 *
 * @param string $url URL of the site, with or without "http(s)://"
 * @return array Always holds 'url' and 'url_valide' (0|1). For a valid URL it
 *               also holds 'url_scheme', 'url_host', 'url_path', 'url_query',
 *               'url_fragment', 'domaine', 'domaine_valide' (0|1); when the
 *               domain resolves: optionally 'ip', then either 'erreur_num' /
 *               'erreur_txt' (getUrl() code 400 or 408) or 'code',
 *               'header_*', 'html_size' and the 'html_*' keys found in <head>.
 */
function getInfosSiteWeb($url) {
    $tabRet = array('url' => $url, 'url_valide' => 0);

    // Is the parameter a syntactically valid URL?
    if (!preg_match('|^(http(s)?://)?[a-z0-9-]+(\.[a-z0-9-]+)*(:[0-9]+)?(/.*)?$|i', $url)) {
        return $tabRet;
    }
    $tabRet['url_valide'] = 1;

    // parse_url() only reports a host when a scheme is present.
    $urlAnalysee = preg_match('|^https?://|i', $url) ? $url : 'http://'.$url;
    $info = parse_url($urlAnalysee);
    if (!is_array($info)) {
        $info = array();
    }
    $host = isset($info['host']) ? $info['host'] : '';

    // "www.example.fr" => $ext = "fr", $domaine = "example"
    $ext = getFileExtension($host);
    $domaine = getFileExtension(preg_replace('/\.'.preg_quote((string) $ext, '/').'$/', '', '.'.$host));
    $nomDomaine = "$domaine.$ext";

    foreach (array('scheme', 'host', 'path', 'query', 'fragment') as $partie) {
        $tabRet['url_'.$partie] = isset($info[$partie]) ? $info[$partie] : null;
    }
    $tabRet['domaine'] = $nomDomaine;
    $tabRet['domaine_valide'] = 0;

    // Is the domain name active?
    if (!checkdnsrr($nomDomaine)) {
        return $tabRet;
    }
    $tabRet['domaine_valide'] = 1;

    // gethostbyname() returns its argument unchanged on failure, hence the check.
    $ip = gethostbyname($nomDomaine);
    if ($this->isIpValide($ip)) {
        $tabRet['ip'] = $ip;
    }

    // Is the site active? getUrl() reports codes 400/408 together with cURL error details.
    $page = getUrl("http://$host", '', '', '', false, $nomDomaine, '', 3, 0);
    if ($page['code'] == 400 || $page['code'] == 408) {
        $tabRet['erreur_num'] = isset($page['header']['curl_errno']) ? $page['header']['curl_errno'] : null;
        $tabRet['erreur_txt'] = isset($page['header']['curl_error']) ? $page['header']['curl_error'] : null;
        return $tabRet;
    }

    /** @todo Detect parking pages and unfinished sites **/
    $tabRet['code'] = $page['code'];
    $entetes = array(
        'header_server'   => 'Server',
        'header_location' => 'Location',
        'header_powerby'  => 'X-Powered-By',
        'header_content'  => 'Content-Type',
    );
    foreach ($entetes as $cle => $entete) {
        $tabRet[$cle] = isset($page['header'][$entete]) ? trim($page['header'][$entete]) : '';
    }
    $tabRet['html_size'] = strlen($page['body']);

    foreach ($this->getInfosHtmlHead($page['body']) as $cle => $valeur) {
        $tabRet[$cle] = $valeur;
    }
    return $tabRet;
}

/**
 * Extracts the raw <head>, the <title> and the known <meta name=... content=...>
 * tags of an HTML document. Texts are converted from UTF-8 with utf8_decode().
 *
 * @param string $body HTML document
 * @return array 'html_head', 'html_title' and 'html_<meta name>' entries that
 *               were found (empty array when there is no <head>)
 */
function getInfosHtmlHead($body) {
    $infos = array();
    if (!preg_match('/<head>(.*)<\/head>/Uis', $body, $matches)) {
        return $infos;
    }
    $htmlHead = $matches[1];
    $infos['html_head'] = $htmlHead;

    if (preg_match('/<title>(.*)<\/title>/Uis', $htmlHead, $matches)) {
        $infos['html_title'] = utf8_decode(trim($matches[1]));
    }
    if (!preg_match_all('/<meta\s+name(?:\s+|)=(.*)\scontent(?:\s+|)=(.*)>/Uis', $htmlHead, $matches)) {
        return $infos;
    }

    // Meta names stored under another key.
    $alias = array(
        'title'            => 'metatitle',
        'content-language' => 'language',
        'languages'        => 'language',
    );
    // Meta names stored under their own name.
    $metasConnues = array(
        'language', 'revisit-after', 'coverage', 'copyright', 'author', 'rating',
        'resource-type', 'classification', 'distribution', 'doc-rights', 'doc-type',
        'robots', 'keywords', 'description', 'generator', 'category', 'owner',
        'identifier-url',
    );

    foreach ($matches[1] as $i => $metaName) {
        $meta = strtolower(trim(strtr($metaName, array('"' => '', "'" => ''))));
        // Strip the quotes (and a trailing "/") surrounding the content value.
        $content = utf8_decode(preg_replace('/^(?:\s+|)("|\')/Uis', '',
                                preg_replace('/("|\')(?:\s+|)(?:\/|)$/Uis', '',
                                    $matches[2][$i])));
        if (isset($alias[$meta])) {
            $infos['html_'.$alias[$meta]] = trim($content);
        } elseif (in_array($meta, $metasConnues, true)) {
            $infos['html_'.$meta] = trim($content);
        } else {
            // Unknown meta: traced on the output, as before.
            echo "$meta='$content'".EOL;
        }
    }
    return $infos;
}
|
||
|
||
/**
 * Scrapes the AFNIC whois web page of a domain and extracts registrar,
 * state, dates, holder identity/address and the identifying SIREN or trademark.
 *
 * The accented labels searched in the page are now all written in UTF-8,
 * like the "Bureau d’enregistrement" and "État" labels already were: the
 * other labels used a different byte encoding and therefore could never match
 * the same page body. The domain is also URL-encoded before being appended
 * to the request URL.
 *
 * NOTE(review): the patterns mirror the AFNIC page markup as it was when
 * this code was written -- confirm against the current page.
 *
 * @param string $siteWeb Domain name to look up (e.g. "example.fr")
 * @return array|int Associative array with any of 'registrar', 'etat',
 *                   'siteWeb', 'dateCrea', 'dateAnniv', 'tituNom', 'tituAdr',
 *                   'tituAdrNum', 'tituAdrCp', 'tituAdrVille', 'tituAdrTypVoie',
 *                   'tituAdrLibVoie', 'tituAdrComp', 'tituTel', 'tituMel',
 *                   'siren', 'marquefr'; or the code reported by getUrl()
 *                   when it is not 200.
 */
function getInfosAfnic($siteWeb) {
    $referer = 'http://www.afnic.fr/';
    $url = 'http://www.afnic.fr/outils/whois/'.rawurlencode($siteWeb);
    $page = getUrl($url, '', '', $referer);
    if ($page['code'] != 200) {
        return $page['code'];
    }

    $body = $page['body'];
    $tabRet = array();

    if (preg_match('/<b>Bureau d’enregistrement \: <\/b>.*">(.*)<\/a><br>/Uis', $body, $matches)) {
        $tabRet['registrar'] = $matches[1];
    }

    if (preg_match('/<\/noscript><b>État \:<\/b>(.*)<\/b>/Uis', $body, $matches)) {
        // Sample: Actif (consultez aussi le <b><a href="http://www.decideo.fr" class="ext" target="_BLANC">Site web</a>
        $etat = trim($matches[1]);
        if (preg_match('/<b><a href="(.*)"/Uis', $etat, $matches)) {
            $tabRet['siteWeb'] = trim($matches[1]);
        }
        // Keep only the text preceding the parenthesis.
        $tmp = explode(' (', $etat);
        $tabRet['etat'] = $tmp[0];
    }

    if (preg_match('/<b>Date de création \: <\/b>(.*)<br>/Uis', $body, $matches)) {
        $tabRet['dateCrea'] = $matches[1];
    }
    if (preg_match('/<b>Date anniversaire \: <\/b>(.*)<br>/Uis', $body, $matches)) {
        $tabRet['dateAnniv'] = trim($matches[1]);
    }

    // Holder ("Titulaire") section.
    if (preg_match("/<span class=h1>Titulaire \: <\/span>(.*)<br><div style='clear\: both;'><\/div>/Uis", $body, $matches)) {
        $strTitu = trim($matches[1]);

        if (preg_match("/<span class=bleuvif>(.*)<\/span><\/h2>/Uis", $strTitu, $matches)) {
            $tabRet['tituNom'] = trim($matches[1]);
        }
        if (preg_match("/<noscript><div id='Layer2' style='display\: block;'><\/noscript>(.*)<b>/Uis", $strTitu, $matches)) {
            $tabRet['tituAdr'] = strip_tags(trim($matches[1]));
            // Split the free-text address into its components.
            $tmp = $this->iInsee->structureVoie($tabRet['tituAdr']);
            $tabRet['tituAdrNum']     = $tmp['num'];
            $tabRet['tituAdrCp']      = $tmp['cp'];
            $tabRet['tituAdrVille']   = preg_replace('/ FRANCE$/', '', trim($tmp['ville']));
            $tabRet['tituAdrTypVoie'] = $tmp['typeVoie'];
            $tabRet['tituAdrLibVoie'] = $tmp['libVoie'];
            $tabRet['tituAdrComp']    = $tmp['adrComp0'];
        }
        if (preg_match("/<b>Téléphone \: <\/b>(.*)<br>/Uis", $strTitu, $matches)) {
            $tabRet['tituTel'] = trim($matches[1]);
        }
        if (preg_match('/<b>Courrier électronique \: <\/b> <a href="mailto:(.*)">/Uis', $strTitu, $matches)) {
            $tabRet['tituMel'] = trim($matches[1]);
        }
    }

    // Holder identified either by its SIREN number or by a French trademark.
    if (preg_match('/<li>Identifié grâce au numéro de SIREN <a href="(?:.*) target="_blank" class="ext">(.*)<\/a><\/li>/Uis', $body, $matches)) {
        $tabRet['siren'] = html_entity_decode($matches[1]);
    } elseif (preg_match('/<li>Identifié grâce à la marque française \((.*)\) <\/li>/Uis', $body, $matches)) {
        $tabRet['marquefr'] = html_entity_decode($matches[1]);
    }

    return $tabRet;
}
|
||
|
||
/**
 * Looks for the web site of a company through a Bing web search on its SIREN
 * (and optionally its name), and records the holder of every ".fr" result.
 *
 * For each result URL:
 *  - its distance (levenshtein) and similarity (similar_text) to
 *    "http://www.<nomEntrep>.fr/" are tracked;
 *  - if the host ends with ".fr" and the URL is not yet in table 'sitesWeb'
 *    with a SIREN, the AFNIC holder is looked up and, when a SIREN is found,
 *    a row (siren, web, dateInsert) is inserted;
 *  - as soon as the closest and the most similar URL are the same one, with
 *    distance < 15 and similarity > 44 %, that URL is returned.
 * Each result is expected to carry a 'Url' entry (it also carries 'Title',
 * 'Description', 'DisplayUrl', 'DateTime', ... which are not used here).
 *
 * Fixes compared with the previous version:
 *  - $rs2 and $referer were undefined in some or all calls;
 *  - the SQL condition compared `web` with an unquoted, unescaped URL;
 *  - the $siren parameter was overwritten inside the loop by the AFNIC holder's SIREN;
 *  - getInfosAfnic() can return an integer, which was indexed like an array;
 *  - a response without results made the loop iterate over null.
 *
 * @param string $siren     SIREN number searched
 * @param string $nomEntrep Company name (optional)
 * @return string|false URL considered to be the company's site, or false
 */
function findSiteWeb($siren, $nomEntrep='') {
    // Bing search. NOTE(review): the API key is hard-coded -- consider moving it to configuration.
    $appId = '56D6CBA671C986D3EA11B1B48F97507BC5A00D51';
    $numResults = 50;
    $cultureInfo = 'fr-FR';
    $source = 'web';    // only web page results are requested

    // "123456789" => "123 456 789"
    $siren2 = implode(' ', str_split($siren, 3));
    $rs2 = ($nomEntrep <> '') ? "OR \"$nomEntrep\"" : '';
    $tabSitesExclus = array('societe.com','bilans.net','gouv.fr','info-financiere.fr','bodacc.fr','manageo.fr','bilansgratuits.fr','lesechos.fr','google.fr');
    $rs = "$siren OR \"$siren2\" $rs2 -site:".implode(' -site:', $tabSitesExclus);
    $query = urlencode($rs);

    $urlApi = "http://api.bing.net/json.aspx?AppId=$appId&Version=2.2&Market=$cultureInfo&Query=$query&Sources=$source&Web.Count=$numResults&JsonType=raw";
    $page = getUrl($urlApi, '', '', '', false);

    $tabJson = json_decode($page['body'], true);
    if (!isset($tabJson['SearchResponse']['Web']['Results'])
        || !is_array($tabJson['SearchResponse']['Web']['Results'])) {
        return false;
    }

    $levMin = 100;
    $pctMin = 0;
    $urlLev = $urlPct = '';
    $urlapprox = "http://www.$nomEntrep.fr/";

    foreach ($tabJson['SearchResponse']['Web']['Results'] as $i => $result) {
        if (!isset($result['Url'])) {
            continue;
        }
        $url = $result['Url'];

        // levenshtein() refused strings longer than 255 characters before PHP 8.
        $lev = (strlen($urlapprox) <= 255 && strlen($url) <= 255) ? levenshtein($urlapprox, $url) : -1;
        if ($lev > 0 && $lev < $levMin) {
            $levMin = $lev;
            $urlLev = $url;
        }
        similar_text($urlapprox, $url, $pct);
        if ($pct > $pctMin && strpos($url, 'zonebourse') === false) {
            $pctMin = $pct;
            $urlPct = $url;
        }

        $info = parse_url($url);
        $host = (is_array($info) && isset($info['host'])) ? $info['host'] : '';
        $ext = getFileExtension($host);
        $domaine = getFileExtension(preg_replace('/\.'.preg_quote((string) $ext, '/').'$/', '', '.'.$host));
        // "\xE9" is a Latin-1 e-acute, the encoding the rest of this class outputs.
        echo "RECHERCHE DE '$nomEntrep' ($siren) : Trouv\xE9 $domaine.$ext".EOL;

        if ($ext == 'fr') {
            // NOTE(review): addslashes() is a stop-gap; use WDB's own escaping/parameter API if it has one.
            $rep = $this->iDb->select('sitesWeb', 'siren, web', "web='".addslashes($url)."'");
            $dejaConnu = isset($rep[0]['siren']) && $rep[0]['siren']*1 != 0;
            if (!$dejaConnu) {
                $tabAfnic = $this->getInfosAfnic("$domaine.$ext");
                $sirenTitulaire = 0;
                if (is_array($tabAfnic) && isset($tabAfnic['siren'])) {
                    $sirenTitulaire = (int) preg_replace('/\D/', '', $tabAfnic['siren']);
                }
                if ($sirenTitulaire > 0) {
                    $this->iDb->insert('sitesWeb', array(
                        'siren'      => $sirenTitulaire,
                        'web'        => $url,
                        'dateInsert' => date('YmdHis'),
                    ));
                }
            }
        }

        if ($levMin < 15 && $pctMin > 44 && $urlLev == $urlPct) {
            echo date('Y-m-d H:i:s') .' - '. $page['code'] . " - $rs - $i - $lev (Min=$levMin) - $pct (Min=$pctMin) - $urlLev - $urlPct - $url !!! RETURNED !!!".EOL;
            return $urlLev;
        }
    }

    return false;
}
|
||
|
||
/**
 * Queries the whois server of a domain's TLD (port 43) and, when the answer
 * names a "Whois Server:", queries that second server as well.
 *
 * Fixes compared with the previous version:
 *  - ereg()/eregi() (removed in PHP 7) are replaced by strpos()/stripos();
 *  - a failed connection to the second server is detected instead of
 *    looping forever on feof() of an invalid handle;
 *  - read loops stop when fgets() fails, and reads time out after 10 s;
 *  - an unknown TLD no longer raises an undefined-index notice and the
 *    returned texts are always defined (empty string when not fetched).
 *
 * @param string $domaine Domain name (e.g. "example.com")
 * @return array 'error'     => status message (also set for non-error outcomes,
 *                              empty when the domain is registered and no
 *                              referral server was found),
 *               'whoisSrv'  => server of the TLD (null when the TLD is unknown),
 *               'whoisSrv2' => referral server found in the first answer,
 *               'whoisTxt'  => raw answer of the first server,
 *               'whoisTxt2' => raw answer of the referral server
 */
function whois($domaine) {
    $msg = $server = $buf = $buf2 = '';

    $labels = explode('.', $domaine);
    $tld = strtolower($labels[count($labels) - 1]);
    $hote = isset($this->whois_serveurs[$tld]) ? $this->whois_serveurs[$tld] : null;

    if (empty($hote)) {
        $msg = "Extension du domaine '$domaine' inconnue";
    } else {
        $fp = fsockopen($hote, 43, $errno, $errstr, 10);
        if (!$fp) {
            $msg = "Erreur de socket no$errno : $errstr";
        } else {
            stream_set_timeout($fp, 10);
            fputs($fp, $domaine . "\r\n");
            while (!feof($fp)) {
                $row = fgets($fp, 128);
                if ($row === false) {
                    break;  // read error or timeout
                }
                $buf .= $row;
                if (stripos($row, 'Whois Server:') !== false) {
                    $server = trim(str_ireplace('Whois Server:', '', $row));
                }
            }
            fclose($fp);

            // Any of these markers in the answer means the domain is not registered.
            $libre = false;
            foreach (array('No match for', 'NOT FOUND', 'Status: FREE', 'No entries found', 'Not found', 'AVAIL') as $marqueur) {
                if (strpos($buf, $marqueur) !== false) {
                    $libre = true;
                    break;
                }
            }

            if ($libre) {
                $msg = "Domaine '$domaine' libre";
            } elseif ($server <> '') {
                // "\xE9" is a Latin-1 e-acute, the encoding the rest of this class outputs.
                $msg = "Domaine '$domaine' enregistr\xE9 chez '$server'";
                $fp = fsockopen($server, 43, $errno, $errstr, 10);
                if (!$fp) {
                    $msg = "Erreur de socket no$errno : $errstr";
                } else {
                    stream_set_timeout($fp, 10);
                    fputs($fp, $domaine . "\r\n");
                    while (!feof($fp)) {
                        $row = fgets($fp, 128);
                        if ($row === false) {
                            break;
                        }
                        $buf2 .= $row;
                    }
                    fclose($fp);
                }
            }
        }
    }

    return array(
        'error'     => $msg,
        'whoisSrv'  => $hote,
        'whoisSrv2' => $server,
        'whoisTxt'  => $buf,
        'whoisTxt2' => $buf2,
    );
}
|
||
|
||
|
||
}
|
||
|
||
/**
 * Returns the extension of a file name, or the last label (TLD) of a domain.
 *
 * Anything from the first "?" onwards is ignored; the extension is the text
 * following the last "." of what remains.
 *
 * @param string $filepath File path, URL or host name
 * @return string|false Text after the last dot (possibly ''), or false when
 *                      there is no dot at all
 */
function getFileExtension($filepath) {
    $chemin = (string) $filepath;

    // Drop the query string, if any.
    $posQuery = strpos($chemin, '?');
    if ($posQuery !== false) {
        $chemin = (string) substr($chemin, 0, $posQuery);
    }

    $posPoint = strrpos($chemin, '.');
    if ($posPoint === false) {
        return false;   // no extension at all
    }
    return (string) substr($chemin, $posPoint + 1);
}
|
||
|
||
/**
 * Returns the base name of a file without its extension.
 *
 * Anything from the first "?" onwards is ignored. The previous version cut
 * the string at the last dot of the whole path, so a name without extension
 * lost its last character ("README" gave "READM") and a dot in a directory
 * name hid the file name ("dir.d/file" gave "dir").
 *
 * @param string $filepath File path or URL
 * @return string File name without directory and without its last extension
 */
function ShowFileName($filepath)
{
    $chemin = (string) $filepath;

    // Drop the query string, if any.
    $posQuery = strpos($chemin, '?');
    if ($posQuery !== false) {
        $chemin = (string) substr($chemin, 0, $posQuery);
    }

    // Work on the last path component only, then strip its last extension.
    $nom = basename($chemin);
    $posPoint = strrpos($nom, '.');
    if ($posPoint === false) {
        return $nom;
    }
    return (string) substr($nom, 0, $posPoint);
}
|
||
|
||
|
||
/********************************************************************************
 * @proto (array) $page  get_web_file( (string) $url[, (string) $user_agent ] )
 *
 * @desc  Fetches $url with cURL, identifying itself as $user_agent.
 * @desc  Returns an array with keys 'errno', 'errmsg', 'url', 'http_code',
 *        'total_time', 'redirect_count' and 'content' (the raw transfer,
 *        response headers included since CURLOPT_HEADER is on).
 *
 * @comm  Redirects are not followed; connect timeout 5 s, overall timeout 3 s.
 * @comm  (original note) has trouble returning binary payloads.
 ********************************************************************************/

function get_web_file( $url, $user_agent = 'Opera/9.64 (X11; Linux i686; U; en) Presto/2.1.1' )
{
    $ch = curl_init($url);
    curl_setopt_array($ch, array(
        CURLOPT_RETURNTRANSFER => true,         // return the transfer instead of printing it
        CURLOPT_HEADER         => true,         // include the response headers
        CURLOPT_MAXREDIRS      => 500,          // upper bound on redirects
        CURLOPT_ENCODING       => '',           // accept every supported encoding
        CURLOPT_USERAGENT      => $user_agent,  // who we claim to be
        CURLOPT_AUTOREFERER    => true,         // set the referer on redirect
        CURLOPT_CONNECTTIMEOUT => 5,
        CURLOPT_TIMEOUT        => 3,            // maximum time cURL may run (s)
    ));

    $contenu = curl_exec($ch);      // page content
    $numErreur = curl_errno($ch);   // cURL error code
    $msgErreur = curl_error($ch);   // cURL error message
    $infos = curl_getinfo($ch);
    curl_close($ch);

    return array(
        'errno'          => $numErreur,
        'errmsg'         => $msgErreur,
        'url'            => $infos['url'],      // last URL reached
        'http_code'      => $infos['http_code'],
        'total_time'     => $infos['total_time'],
        'redirect_count' => $infos['redirect_count'],
        'content'        => $contenu,
    );
}
|
||
/* $page is returned as below:
Array
(
    [url]            => the URL
    [http_code]      => the HTTP status code
    [redirect_count] => number of redirects
    [total_time]     => execution time
    [errno]          => cURL error number
    [errmsg]         => error message
    [content]        => the fetched file, as a string
)
*/
|
||
|
||
///////////////////////////////////////////////////////////////////////////////
|
||
|
||
/********************************************************************************
 * @proto (integer|string) $http_error  get_http_error( (string) $url )
 *
 * @desc  Sends a body-less request (CURLOPT_NOBODY) to $url and returns the
 *        HTTP status code, or the cURL error message when the reported
 *        status code is 0.
 * @comm  Meant to be lighter than get_web_file(). Connect timeout is 120 s
 *        and there is no overall timeout (CURLOPT_TIMEOUT = 0).
 ********************************************************************************/

function get_http_error( $url )
{
    $ch = curl_init($url);
    curl_setopt_array($ch, array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HEADER         => true,
        CURLOPT_NOBODY         => true,     // headers only
        CURLOPT_MAXREDIRS      => 500,
        CURLOPT_ENCODING       => '',
        CURLOPT_USERAGENT      => 'Opera/9.64 (X11; Linux i686; U; en) Presto/2.1.1',
        CURLOPT_AUTOREFERER    => true,
        CURLOPT_CONNECTTIMEOUT => 120,
        CURLOPT_TIMEOUT        => 0,
    ));

    curl_exec($ch);
    $msgErreur = curl_error($ch);
    $infos = curl_getinfo($ch);
    curl_close($ch);

    if ( $infos['http_code'] === 0 ) {
        return $msgErreur;
    }
    return $infos['http_code'];
}
|
||
|
||
///////////////////////////////////////////////////////////////////////////////
|
||
|
||
/********************************************************************************
 * @proto (string) $cleansed_url  resolve_url( (string) $raw_url, (string) $url )
 * @link  http://en.wikipedia.org/wiki/Uniform_Resource_Locator
 *
 * @desc  Rebuilds the absolute URL designated by $raw_url (a link found in a
 *        page) relative to $url (the address of that page).
 *        - "/x"            => site root of $url + "x"
 *        - "./x", "../x"   => current directory of $url + link
 *        - "?x", "&x"      => $url + link
 *        - "#x"            => $url + link, or '' when $url already has an anchor
 *        - javascript:, mailto: => ''
 *        - "scheme:/..."   => taken as is
 *        - anything else   => current directory of $url + link
 *        The result is '' unless it starts with "scheme:/"; "/./" and
 *        duplicated slashes are collapsed and ".." segments are resolved.
 *
 * @comm  Fixes: ".." segments are now resolved on the path only with a
 *        segment stack. The former regex loop never terminated when it could
 *        not reduce the string (more ".." than directories, or "../" inside
 *        the query string) and could swallow the host name. Empty $raw_url
 *        and a base $url without scheme no longer raise notices.
 ********************************************************************************/

function resolve_url( $raw_url, $url )
{
    $raw_url = (string) $raw_url;
    $url = (string) $url;
    $cleansed_url = '';

    // Add a trailing slash to bare roots ("http://host" => "http://host/").
    if ( preg_match('#^[^:/]+:/+[^/]+$#', $url) ) $url .= '/';
    if ( preg_match('#^[^:/]+:/+[^/]+$#', $raw_url) ) $raw_url .= '/';

    // Site root ("scheme://host/") and current directory (everything up to the last slash).
    $racine = preg_match('#^([^:/]+:/+[^/]+/)#', $url, $trouve) ? $trouve[1] : '';
    $courant = preg_match('#(.+/)[^/]*$#', $url, $trouve) ? $trouve[1] : '';

    $premier = ( $raw_url === '' ) ? '' : $raw_url[0];

    if ( $premier === '/' ) {                               // absolute path
        $cleansed_url = $racine . substr($raw_url, 1);      // substr() avoids a double slash
    } else if ( $premier === '.' ) {                        // "./" and "../"
        $cleansed_url = $courant . $raw_url;
    } else if ( $premier === '?' || $premier === '&' ) {    // query string
        $cleansed_url = $url . $raw_url;
    } else if ( $premier === '#' ) {                        // anchor, only if $url has none yet
        $cleansed_url = ( !preg_match('%#[^#]*$%', $url) ) ? $url . $raw_url : '';
    } else if ( preg_match('#^javascript\s*:#i', $raw_url)
             || preg_match('#^mailto\s*:\s*((?:[^i]|i)+)$#i', $raw_url) ) {
        $cleansed_url = '';                                 // not a fetchable resource
    } else if ( preg_match('#^[^:/]+:/#', $raw_url) ) {     // already an absolute URL
        $cleansed_url = $raw_url;
    } else {                                                // anything else: relative to the current directory
        $cleansed_url = $courant . $raw_url;
    }

    // Discard results that do not start with "scheme:/" or that hold nothing after it.
    if ( !preg_match('#^[^:/]+:/+#', $cleansed_url)
        || preg_match('#^[^:/]+:/+\s*$#', $cleansed_url) ) {
        return '';
    }

    $cleansed_url = trim($cleansed_url);

    // Replace "/./" with "/".
    $cleansed_url = str_replace('/./', '/', $cleansed_url);

    // Replace "//" with "/" unless it directly follows ":".
    $cleansed_url = preg_replace('#(?<!\:)//#', '/', $cleansed_url);

    // Resolve ".." segments of the path (query string and anchor are left untouched).
    if ( strpos($cleansed_url, '../') !== false
        && preg_match('~^([^:/]+:/+[^/]+)(/[^?#]*)(.*)$~s', $cleansed_url, $parties) ) {
        $segments = explode('/', substr($parties[2], 1));
        $dernier = count($segments) - 1;
        $pile = array();
        foreach ( $segments as $idx => $segment ) {
            if ( $segment === '..' ) {
                array_pop($pile);           // no-op at the root: never climbs above it
                if ( $idx === $dernier ) {
                    $pile[] = '';           // a trailing ".." designates a directory
                }
            } else {
                $pile[] = $segment;
            }
        }
        $cleansed_url = $parties[1] . '/' . implode('/', $pile) . $parties[3];
    }

    return $cleansed_url;
}
|
||
|
||
///////////////////////////////////////////////////////////////////////////////
|
||
|
||
/********************************************************************************
 * @proto (array) $urls  get_urls( (string) $raw_page_content, (string) $url )
 *
 * @desc  Extracts the links found in $raw_page_content, resolves them against
 *        $url with resolve_url(), and returns the de-duplicated list of the
 *        resolved URLs together with every parent level of their path
 *        ("http://h/a/b.html" yields "http://h/", "http://h/a/" and
 *        "http://h/a/b.html").
 *
 * @comm  Fixes: an URL whose shape is not handled is now skipped (it used to
 *        abort the whole extraction and drop every following link); a path
 *        equal to "0" is no longer ignored; an empty result list no longer
 *        triggers an undefined-offset notice.
 ********************************************************************************/

function get_urls( $raw_page_content, $url )
{
    $raw_page_content .= PHP_EOL;

    // (string) regex => (int) index of the capturing group holding the URL
    $regexs = array(
        // Common (X)HTML attributes: href, src, url/uri, path, action, role, xmlns[:prefix]
        '%(?:href|src|ur[li]|path|action|role|xmlns(?::[^:=]+)?)\s*=\s*(["\'])\s*([^\1]+)\s*\1%Ui' => 2,

        // robots.txt directives
        '%(?:Disallow|Sitemap)\s*:\s+(\S+)\s*%i' => 1,
        // CSS url(...)
        '%url\s*\((["\']|)(?(1)([^\1]+)\1|([^()]+))\)%Ui' => 2,
        '%url\s*(["\'])([^\1])\1%iU' => 2,
        // Meta refresh
        '%content=(["\'])[^\1\D]*;\s*url=([^\1]+)\1%iU' => 2,

        // XML element whose text content is an URL
        '%<([^<>\s]+)(?:\s+[^>]+)?\>\s*([^>\s:/]+:/+[^<]+)\s*</\1>%' => 2,
    );

    // Run every pattern and gather the captured candidates.
    $matches = array();
    foreach ( $regexs as $regex => $groupe )
    {
        if ( preg_match_all($regex, $raw_page_content, $trouves, PREG_PATTERN_ORDER) ) {
            $matches = array_merge($matches, $trouves[$groupe]);
        }
    }
    $matches = array_values(array_unique($matches));    // drop duplicates, reindex from 0

    $url = trim($url);
    $nbUrl_ = count($matches);
    $urls = array();

    foreach ( $matches as $iUrl_ => $raw_url )
    {
        $raw_url = trim($raw_url);
        if ( $raw_url === '' ) {
            continue;                                   // empty capture
        }

        $res = resolve_url($raw_url, $url);
        if ( empty($res) ) {
            continue;                                   // not resolvable
        }

        // Split into "scheme://host/" and the path that follows.
        if ( !preg_match('#^([^:/]+:/+[^/]+/)(.*)$#', $res, $preg_url) ) {
            // "\xE9" is a Latin-1 e-acute, the encoding the rest of this file outputs.
            echo ("$iUrl_/$nbUrl_: Type d'URL non g\xE9r\xE9e : '$res'".EOL);
            continue;
        }

        $str = $preg_url[1];
        $urls[] = $str;                                 // the root itself

        if ( $preg_url[2] !== '' )
        {
            // Enumerate every level of the path.
            $exp = explode('/', $preg_url[2]);
            $k = count($exp);
            for ( $i = 0; $i < $k; $i++ )
            {
                // No trailing slash on the last part: it may be a file.
                $str .= ($i == $k - 1) ? $exp[$i] : $exp[$i].'/';
                $urls[] = $str;
            }
        }
    }

    $urls = array_values(array_unique($urls));          // drop duplicates, reindex from 0
    if ( !empty($urls) && empty($urls[ count($urls) - 1 ]) ) {
        array_pop($urls);                               // drop a trailing empty entry, if any
    }

    return $urls;
}
|
||
/**
 * Fetches the document at $q and returns the URLs it references.
 *
 * $q is accepted only when it contains "://" followed by at least one
 * character and holds none of the characters < > " '. A bare root
 * ("http://host") gets a trailing slash before being fetched.
 *
 * Fix: an empty array is now returned when $q is rejected; the variable was
 * previously undefined in that case (null plus a notice).
 *
 * @param string $q URL of the document to crawl
 * @return array URLs found by get_urls(), sorted with natcasesort() (keys are preserved)
 */
function crawler($q) {
    $urls = array();

    // Reject anything that does not look like a plain URL.
    if ( !preg_match('#[^:/\s]://.+#', $q) || preg_match('#[<>"\']#', $q) ) {
        return $urls;
    }

    if ( preg_match('#^[^:/]+:/+[^/]+$#', $q) ) {
        $q .= '/';      // root without trailing slash
    }

    $curl = get_web_file($q);                   // fetch the target document
    $urls = get_urls($curl['content'], $q);     // URLs found in it
    natcasesort($urls);                         // case-insensitive natural order

    return $urls;
}
|
||
|
||
?>
|