backoffice/includes/partenaires/classMWeb.php
2011-06-21 13:28:10 +00:00

772 lines
30 KiB
PHP
Raw Permalink Blame History

<?
class MSitesWeb {
private $whois_serveurs = array(
"ac" => "whois.nic.ac",
"al" => "whois.ripe.net",
"am" => "whois.amnic.net",
"as" => "whois.nic.as",
"at" => "whois.ripe.net",
"au" => "whois.aunic.net",
"az" => "whois.ripe.net",
"ba" => "whois.ripe.net",
"be" => "whois.ripe.net",
"bg" => "whois.ripe.net",
"biz" => "whois.nic.biz",
"br" => "whois.registro.br",
"by" => "whois.ripe.net",
"ca" => "whois.cira.ca",
"cc" => "whois.nic.cc",
"ch" => "whois.nic.ch",
"ck" => "whois.ck-nic.org.ck",
"cn" => "whois.cnnic.net.cn",
"com" => "whois.crsnic.net",
"cx" => "whois.nic.cx",
"cy" => "whois.ripe.net",
"cz" => "whois.nic.cz",
"de" => "whois.denic.de",
"dk" => "whois.dk-hostmaster.dk",
"dz" => "whois.ripe.net",
"edu" => "rs.internic.net",
"ee" => "whois.ripe.net",
"eg" => "whois.ripe.net",
"es" => "whois.ripe.net",
"eu" => "whois.eu",
"fi" => "whois.ripe.net",
"fj" => "whois.usp.ac.fj",
"fo" => "whois.ripe.net",
"fr" => "whois.nic.fr",
"gb" => "whois.ripe.net",
"ge" => "whois.ripe.net",
"gov" => "whois.nic.gov",
"gr" => "whois.ripe.net",
"gs" => "whois.adamsnames.tc",
"hk" => "whois.hknic.net.hk",
"hm" => "whois.registry.hm",
"hr" => "whois.ripe.net",
"hu" => "whois.ripe.net",
"id" => "whois.idnic.net.id",
"ie" => "whois.domainregistry.ie",
"info" => "whois.afilias.net",
"int" => "whois.isi.edu",
"il" => "whois.ripe.net",
"is" => "whois.isnet.is",
"it" => "whois.nic.it",
"jp" => "whois.nic.ad.jp",
"ke" => "whois.rg.net",
"kg" => "whois.domain.kg",
"kr" => "whois.nic.or.kr",
"kz" => "whois.domain.kz",
"li" => "whois.nic.li",
"lk" => "whois.nic.lk",
"lt" => "whois.ripe.net",
"lu" => "whois.ripe.net",
"lv" => "whois.ripe.net",
"ma" => "whois.ripe.net",
"md" => "whois.ripe.net",
"mil" => "whois.nic.mil",
"mk" => "whois.ripe.net",
"mm" => "whois.nic.mm",
"ms" => "whois.adamsnames.tc",
"mt" => "whois.ripe.net",
"mx" => "whois.nic.mx",
"net" => "rs.internic.net",
"nl" => "whois.domain-registry.nl",
"no" => "whois.norid.no",
"nu" => "whois.nic.nu",
"nz" => "whois.domainz.net.nz",
"org" => "whois.pir.org",
"pl" => "whois.ripe.net",
"pk" => "whois.pknic.net.pk",
"pt" => "whois.ripe.net",
"ro" => "whois.ripe.net",
"ru" => "whois.ripn.ru",
"se" => "whois.nic-se.se",
"sg" => "whois.nic.net.sg",
"si" => "whois.ripe.net",
"sh" => "whois.nic.sh",
"sk" => "whois.ripe.net",
"sm" => "whois.ripe.net",
"st" => "whois.nic.st",
"su" => "whois.ripe.net",
"tc" => "whois.adamsnames.tc",
"tf" => "whois.adamsnames.tc",
"tj" => "whois.nic.tj",
"th" => "whois.thnic.net",
"tm" => "whois.nic.tm",
"tn" => "whois.ripe.net",
"to" => "whois.tonic.to",
"tr" => "whois.ripe.net",
"tw" => "whois.twnic.net",
"ua" => "whois.ripe.net",
"uk" => "whois.nic.uk",
"us" => "whois.isi.edu",
"va" => "whois.ripe.net",
"vg" => "whois.adamsnames.tc",
"ws" => "whois.nic.ws",
"yu" => "whois.ripe.net",
"za" => "whois.frd.ac.za");
private $iInsee;
private $iDb;
function __construct(/*$siren, $accesDist=true*/) {
$this->iInsee=new MInsee();
$this->iDb=new WDB('jo');
}
/** L'adresse IP est elle valide ?
* @param $ip Adresse IP v4
*/
function isIpValide($ip) {
$ip_explode = explode('.',$ip);
$nb_valide = 0;
foreach ($ip_explode as $element)
if ($element>=0 && $element<255)
$nb_valide++;
if ($nb_valide==4) return true;
return false;
}
function getInfosSiteWeb($url) {
$tabRet=array('url'=>$url, 'url_valide'=>0);
// Est-ce une URL valide en param<61>tre ?
if(preg_match('|^(http(s)?://)?[a-z0-9-]+(.[a-z0-9-]+)*(:[0-9]+)?(/.*)?$|i', $url)) {
//$tabRet['url']=preg_replace('/^http\/\//', 'http://', $url);
$tabRet['url_valide']=1;
$info=parse_url($url);
$host=@preg_replace('/\/$/','',$info['host']);
$ext=getFileExtension($host);
$domaine=getFileExtension(preg_replace("/\.$ext$/",'','.'.$host));
$tabRet['url_scheme']=@$info['scheme'];
$tabRet['url_host']=@$info['host'];
$tabRet['url_path']=@$info['path'];
$tabRet['url_query']=@$info['query'];
$tabRet['url_fragment']=@$info['fragment'];
$tabRet['domaine']="$domaine.$ext";
$tabRet['domaine_valide']=0;
// Le nom de domaine est il actif ?
if (checkdnsrr("$domaine.$ext")) {
$tabRet['domaine_valide']=1;
$ip=gethostbyname("$domaine.$ext");
if ($this->isIpValide($ip)) $tabRet['ip']=$ip;
$page=getUrl("http://$host", '', '', '', false, "$domaine.$ext", '', 3, 0);
// Le site est il actif ?
if ($page['code']==400 || $page['code']==408) {
$tabRet['erreur_num']=@$page['header']['curl_errno'];
$tabRet['erreur_txt']=@$page['header']['curl_error'];
} else {
/** @todo V<>rifier les pages de parking et les sites non encore termin<69>s **/
$tabRet['code']=$page['code'];
//$tabRet['header']=$page['header'];
$tabRet['header_server']=trim($page['header']['Server']);
$tabRet['header_location']=trim($page['header']['Location']);
$tabRet['header_powerby']=trim($page['header']['X-Powered-By']);
$tabRet['header_content']=trim($page['header']['Content-Type']);
$tabRet['html_size']=strlen($page['body']);
if (preg_match('/<head>(.*)<\/head>/Uis',$page['body'],$matches)) {
$html_head=$matches[1];
$tabRet['html_head']=$html_head;
if (preg_match('/<title>(.*)<\/title>/Uis',$html_head,$matches))
$tabRet['html_title']=utf8_decode(trim($matches[1]));
if (preg_match_all('/<meta\s+name(?:\s+|)=(.*)\scontent(?:\s+|)=(.*)>/Uis',$html_head,$matches)) {
foreach ($matches[1] as $i=>$metaName) {
$meta=strtolower(trim(strtr($metaName, array('"'=>'', "'"=>''))));
$content=utf8_decode(preg_replace('/^(?:\s+|)("|\')/Uis','',
preg_replace('/("|\')(?:\s+|)(?:\/|)$/Uis','',
$matches[2][$i])));
switch ($meta) {
case 'title':
$meta='metatitle';
case 'content-language':
$meta='language';
case 'languages':
$meta='language';
case 'language':
case 'revisit-after':
case 'coverage':
case 'copyright':
case 'author':
case 'rating':
case 'resource-type':
case 'classification':
case 'distribution':
case 'doc-rights':
case 'doc-type':
case 'robots':
case 'keywords':
case 'description':
case 'generator':
case 'category':
case 'owner':
case 'identifier-url':
$tabRet['html_'.$meta]=utf8_decode(trim($content));
break;
default:
echo "$meta='$content'".EOL;
break;
}
}
}
}
}
}
}
return $tabRet;
}
function getInfosAfnic($siteWeb) {
$referer='http://www.afnic.fr/';
$url='http://www.afnic.fr/outils/whois/'.$siteWeb;
$page=getUrl($url, '', '', $referer);
if ($page['code']==200) {
$tabRet=array();
if (preg_match('/<b>Bureau d&rsquo;enregistrement \: <\/b>.*">(.*)<\/a><br>/Uis',$page['body'],$matches))
$tabRet['registrar']=$matches[1];
if (preg_match('/<\/noscript><b>&Eacute;tat \:<\/b>(.*)<\/b>/Uis',$page['body'],$matches)) {
$tabRet['etat']=trim($matches[1]); /*
Actif (consultez aussi le <b><a href="http://www.decideo.fr" class="ext" target="_BLANC">Site web</a>*/
if (preg_match('/<b><a href="(.*)"/Uis',$tabRet['etat'],$matches))
$tabRet['siteWeb']=trim($matches[1]);
$tmp=explode(' (', $tabRet['etat']);
$tabRet['etat']=$tmp[0];
}
if (preg_match('/<b>Date de cr<63>ation \: <\/b>(.*)<br>/Uis',$page['body'],$matches))
$tabRet['dateCrea']=$matches[1];
if (preg_match('/<b>Date anniversaire \: <\/b>(.*)<br>/Uis',$page['body'],$matches))
$tabRet['dateAnniv']=trim($matches[1]);
if (preg_match("/<span class=h1>Titulaire \: <\/span>(.*)<br><div style='clear\: both;'><\/div>/Uis",$page['body'],$matches)) {
$strTitu=trim($matches[1]);
if (preg_match("/<span class=bleuvif>(.*)<\/span><\/h2>/Uis",$strTitu,$matches))
$tabRet['tituNom']=trim($matches[1]);
if (preg_match("/<noscript><div id='Layer2' style='display\: block;'><\/noscript>(.*)<b>/Uis",$strTitu,$matches)) {
$tabRet['tituAdr']=strip_tags(trim($matches[1]));
$tmp=$this->iInsee->structureVoie($tabRet['tituAdr']);
$tabRet['tituAdrNum']=$tmp['num'];
$tabRet['tituAdrCp']=$tmp['cp'];
$tabRet['tituAdrVille']=preg_replace('/ FRANCE$/','',trim($tmp['ville']));
$tabRet['tituAdrTypVoie']=$tmp['typeVoie'];
$tabRet['tituAdrLibVoie']=$tmp['libVoie'];
$tabRet['tituAdrComp']=$tmp['adrComp0'];
}
if (preg_match("/<b>T<>l<EFBFBD>phone \: <\/b>(.*)<br>/Uis",$strTitu,$matches))
$tabRet['tituTel']=trim($matches[1]);
if (preg_match('/<b>Courrier <20>lectronique \: <\/b> <a href="mailto:(.*)">/Uis',$strTitu,$matches))
$tabRet['tituMel']=trim($matches[1]);
}
if(preg_match('/<li>Identifi<66> gr<67>ce au num<75>ro de SIREN <a href="(?:.*) target="_blank" class="ext">(.*)<\/a><\/li>/Uis', $page['body'], $matches))
$tabRet['siren']=html_entity_decode($matches[1]);
elseif (preg_match('/<li>Identifi<66> gr<67>ce <20> la marque fran<61>aise \((.*)\) <\/li>/Uis', $page['body'], $matches))
$tabRet['marquefr']=html_entity_decode($matches[1]);
} else
return $page['code'];
return $tabRet;
}
function findSiteWeb($siren, $nomEntrep='') {
// Recherche BING
$appId = '56D6CBA671C986D3EA11B1B48F97507BC5A00D51';
$numResults = 50;
$cultureInfo = 'fr-FR';
$siren2=implode(' ', str_split($siren, 3));
if ($nomEntrep<>'') $rs2="OR \"$nomEntrep\"";
$tabSitesExclus=array('societe.com','bilans.net','gouv.fr','info-financiere.fr','bodacc.fr','manageo.fr','bilansgratuits.fr','lesechos.fr','google.fr');
$rs="$siren OR \"$siren2\" $rs2 -site:".implode(' -site:', $tabSitesExclus);
$query=stripslashes(urlencode($rs));
$tabSources=array( 'web'=>'Web page results',
/*'image'=>'Full-size image and thumbnail image information, including the file size in bytes (if available), height and width in pixels (if available), and the URI to the full-size image or thumbnail',
'instantAnswer'=>'Answers. The result fields returned for requests that specify InstantAnswer vary based on the value or values specified for the Query property. InstantAnswer results can include Encarta, FlightStatus, Finance, Music, Sports, Weather, and Movie ShowTimes. For the Version 2.0 release, results include Encarta and FlightStatus only. Other results are available by invitation',
'mobileWeb'=>'Mobile Web page results (primarily Extensible Hypertext Markup Language (XHTML) and Wireless Markup Language (WML)',
'phoneBook'=>'Results from online White Pages (residential) and Yellow Pages (commercial) entries',
'relatedSearch'=>'Suggestions for other searches related to the query term or terms',
'spell'=>'Spelling suggestions',
'translation'=>'Translated results for a queried',
'video'=>'Video results',*/
//'news'=>'Results from online news services',
);
$source=implode('+', array_keys($tabSources));
$url="http://api.bing.net/json.aspx?AppId=$appId&Version=2.2&Market=$cultureInfo&Query=$query&Sources=$source&Web.Count=$numResults&JsonType=raw";
$page=getUrl($url, '', '', $referer, false);
$json=$page['body'];
$tabJson=json_decode($json, true);
$tabJson=$tabJson['SearchResponse'];
$levMin=100;
$pctMin=0;
$urlLev=$urlPct='';
$urlapprox="http://www.$nomEntrep.fr/";
foreach ($tabJson['Web']['Results'] as $i=> $result) {
$title=utf8_decode($result['Title']); // SCORES & DECISIONS - Accueil
$desc=utf8_decode($result['Description']); // Scores et Décisions - Le nouvel acteur de l'information sur les entreprises et leurs dirigeants avec une approche innovante
$url=$result['Url']; // http://www3.scores-decisions.com/
$lev=@levenshtein ($urlapprox,$url);
if ($lev>0 && $lev<$levMin) {
$levMin=$lev;
$urlLev=$url;
}
$sim=similar_text($urlapprox,$url,$pct);
if ($pct>$pctMin && strpos($url, 'zonebourse')===false) {
$pctMin=$pct;
$urlPct=$url;
}
if (preg_match('/\.(.*\.fr)\//', $url, $matches2)) {
}
$info=parse_url($url);
$host=preg_replace('/\/$/','',$info['host']);
$ext=getFileExtension($host);
$domaine=getFileExtension(preg_replace("/\.$ext$/",'','.'.$host));
echo "RECHERCHE DE '$nomEntrep' ($siren) : Trouv<EFBFBD> $domaine.$ext".EOL;
if ($ext=='fr') {
$rep=$this->iDb->select('sitesWeb', 'siren, web', "web=$url");
if (@$rep[0]['siren']*1==0) {
$tabAfnic=$this->getInfosAfnic("$domaine.$ext");
$siren=$tabAfnic['siren']*1;
if ($siren>0) {
$tabInsert=array('siren'=>$siren,
'web'=>$url,
'dateInsert'=>date('YmdHis'));
$this->iDb->insert('sitesWeb', $tabInsert);
}
}
}
if ($levMin<15 && $pctMin>44 && $urlLev==$urlPct) {
echo date('Y-m-d H:i:s') .' - '. $page['code'] . " - $rs - $i - $lev (Min=$levMin) - $pct (Min=$pctMin) - $urlLev - $urlPct - $url !!! RETURNED !!!".EOL;
return $urlLev;
}
}
/*
[0] => Array
(
[Title] => SCORES & DECISIONS - Accueil
[Description] => Scores et Décisions - Le nouvel acteur de l'information sur les entreprises et leurs dirigeants avec une approche innovante
[Url] => http://www3.scores-decisions.com/
[CacheUrl] => http://cc.bingj.com/cache.aspx?q=scores+decisions&d=5004075153885515&mkt=fr-FR&w=881d2897,4f2fff68
[DisplayUrl] => www3.scores-decisions.com
[DateTime] => 2011-02-14T12:24:00Z
[DeepLinks] => Array
(
[0] => Array
(
[Title] => Partenaires
[Url] => http://www3.scores-decisions.com/partenaires.php
)
[1] => Array
(
[Title] => Contact
[Url] => http://www3.scores-decisions.com/contact.php
)
)
)
[1] => Array
(
[Title] => SCORES & DECISIONS - Société
[Description] => Scores et Décisions - Le nouvel acteur de l'information sur les entreprises et leurs dirigeants avec une approche innovante
[Url] => http://www3.scores-decisions.com/societe.php
[CacheUrl] => http://cc.bingj.com/cache.aspx?q=scores+decisions&d=4747772983970513&mkt=fr-FR&w=10f5cd33,9b81f773
[DisplayUrl] => www3.scores-decisions.com/societe.php
[DateTime] => 2011-02-13T02:17:00Z
)*/
return false;
}
function whois($domaine) {
$parseur=explode(".", $domaine);
$hote=$this->whois_serveurs[strtolower($parseur[count($parseur)-1])];
$msg='';
if (empty($hote)) {
$msg="Extension du domaine '$domaine' inconnue";
} else {
$fp = fsockopen($hote, 43, $errno, $errstr, 10);
if (!$fp) {
$msg="Erreur de socket no$errno : $errstr";
} else {
fputs($fp, $domaine . "\r\n");
$buf=$server='';
while (!feof($fp)) {
$row=fgets($fp, 128);
$buf.=$row;
if (eregi("Whois Server:", $row))
$server = trim(str_replace('Whois Server:', '', $row));
}
fclose($fp);
if (ereg("No match for", $buf) ||
ereg("NOT FOUND", $buf) ||
ereg("Status: FREE", $buf) ||
ereg("No entries found", $buf) ||
ereg("Not found", $buf) ||
ereg("AVAIL", $buf)) {
$msg="Domaine '$domaine' libre";
} else {
//echo "<p><strong>Le nom de domaine <font color=\"red\">" . $_POST['domaine'] . "</font> est d&eacute;j&agrave; pris</strong></p>";
if ($server<>'') {
$msg="Domaine '$domaine' enregistr<74> chez '$server'";
$fp = fsockopen($server, 43, $errno, $errstr, 10);
fputs($fp, $domaine."\r\n");
$buf2='';
while (!feof($fp))
$buf2.=fgets($fp, 128);
fclose($fp);
}
}
}
}
return array( 'error'=>$msg,
'whoisSrv'=>$hote,
'whoisSrv2'=>$server,
'whoisTxt'=>$buf,
'whoisTxt2'=>$buf2);
}
}
/** Retourne l'extension d'un domaine ou du fichier !!! **/
function getFileExtension($filepath) {
preg_match('/[^?]*/', $filepath, $matches);
$string = $matches[0];
$pattern = preg_split('/\./', $string, -1, PREG_SPLIT_OFFSET_CAPTURE);
// check if there is any extension
if(count($pattern) == 1)
return false;
if(count($pattern)>1) {
$filenamepart = $pattern[count($pattern)-1][0];
preg_match('/[^?]*/', $filenamepart, $matches);
return $matches[0];
}
}
function ShowFileName($filepath)
{
preg_match('/[^?]*/', $filepath, $matches);
$string = $matches[0];
#split the string by the literal dot in the filename
$pattern = preg_split('/\./', $string, -1, PREG_SPLIT_OFFSET_CAPTURE);
#get the last dot position
$lastdot = $pattern[count($pattern)-1][1];
#now extract the filename using the basename function
$filename = basename(substr($string, 0, $lastdot-1));
#return the filename part
return $filename;
}
/********************************************************************************
* @proto (array) $page get_web_file( (string) $url[, (string) $user_agent ] )
* @desc cURL va chercher $url en temps que $user_agent
* @desc et retourne entre autre $page['content']
* @comm -r<>gler le timeout
* @comm a du mal <20> renvoyer des bin dans $header['content']
********************************************************************************/
function get_web_file( $url, $user_agent = 'Opera/9.64 (X11; Linux i686; U; en) Presto/2.1.1' )
{
$options = array(
//CURLOPT_SSL_VERIFYPEER => false, // Ne v<>rifie pas les certificats
CURLOPT_RETURNTRANSFER => true, // return plut<75>t que echo|print
CURLOPT_HEADER => true, // Renvoie les headers
//CURLOPT_FOLLOWLOCATION => true, // Suivre les redirections, limit<69> par...
CURLOPT_MAXREDIRS => 500, // ...le max de redirections
CURLOPT_ENCODING => '', // Accepte tous les encodages
CURLOPT_USERAGENT => $user_agent,// Qui Je Suis (avec Jackie Chan)
CURLOPT_AUTOREFERER => true, // Set referer on redirect
CURLOPT_CONNECTTIMEOUT => 5,
CURLOPT_TIMEOUT => 3, // Temps maximum d'utilisation de cURL (s)
);
$ch = curl_init($url);
curl_setopt_array($ch, $options);
$content = curl_exec($ch); // contenu de la page
$page['errno'] = curl_errno($ch); // code d'erreur cURL
$page['errmsg'] = curl_error($ch); // message d'erreur cURL
$curl = curl_getinfo($ch);
curl_close($ch);
$page['url'] = $curl['url']; // possibles redirections = derni<6E>re url
$page['http_code'] = $curl['http_code'];
$page['total_time'] = $curl['total_time'];
$page['redirect_count'] = $curl['redirect_count'];
$page['content'] = $content;
return $page;
}
/* $page return as below:
Array
(
[url] => l'url
[http_code] => l'erreur http
[redirect_count] => nombre de redirections
[total_time] => temps d'execution
[errno] => num<75>ro d'erreur cURL
[errmsg] => message d'erreur
[content] => fichier en string :p
)
*/
///////////////////////////////////////////////////////////////////////////////
/********************************************************************************
* @proto (integer) $http_error get_http_error( (string) $url )
********************************************************************************/
function get_http_error( $url )
{
$options = array(
//CURLOPT_SSL_VERIFYPEER => false,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_HEADER => true,
CURLOPT_NOBODY => true, // get_http_error() doit <20>tre faster than get_web_file()
//CURLOPT_FOLLOWLOCATION => true,
CURLOPT_MAXREDIRS => 500,
CURLOPT_ENCODING => '',
CURLOPT_USERAGENT => 'Opera/9.64 (X11; Linux i686; U; en) Presto/2.1.1',
CURLOPT_AUTOREFERER => true,
CURLOPT_CONNECTTIMEOUT => 120,
CURLOPT_TIMEOUT => 0,
);
$ch = curl_init($url);
curl_setopt_array($ch, $options);
curl_exec($ch);
$errmsg = curl_error($ch);
$curl = curl_getinfo($ch);
curl_close($ch);
return ( $curl['http_code'] === 0 ) ? $errmsg : $curl['http_code'];
}
///////////////////////////////////////////////////////////////////////////////
/********************************************************************************
* @proto (string) $cleansed_url resolve_url( (string) $raw_url, (string) $url )
* @link http://en.wikipedia.org/wiki/Uniform_Resource_Locator
* @desc reconstruit $raw_url <20> partir de $url et en d<>duit des $res['url']
* @comm g<>re les mails
********************************************************************************/
function resolve_url( $raw_url, $url )
{
$cleansed_url = '';
// Met un slash <20> la fin des racines si yen a pas
if ( preg_match('#^[^:/]+:/+[^/]+$#', $url) ) $url = $url.'/';
if ( preg_match('#^[^:/]+:/+[^/]+$#', $raw_url) ) $raw_url = $raw_url.'/';
// Racine du site
preg_match('#^([^:/]+:/+[^/]+/)#', $url, $preg_racine);
$racine = $preg_racine[1];
// Dossier courant
preg_match('#(.+/)[^/]*$#', $url, $preg_courant);
$courant = $preg_courant[1];
/* On sait qu'un dossier en est un quand il se termine par un /
Pour cette m<>me raison, on ne peut d<>terminer avec certitude si on a un fichier plut<75>t qu'un dossier.
Sauf (dans un contexte initial : sans URL-Rewriting) quand il y a une ancre : c'est un fichier. */
// Non-dossier courant
$sub_nondir_courant = substr($url, strlen($courant) - 1);
$nondir_courant = ( $sub_nondir_courant === false ) ? '' : $sub_nondir_courant;
//Scheme et slashs
/* Modif's */
if ( $raw_url[0] == '/' ) // Si $raw_url commence par un /
$cleansed_url = $racine . substr($raw_url, 1); // substr() emp<6D>che le double slash
else if ( $raw_url[0] == '.' ) // Pour ./ et ../
$cleansed_url = $courant . $raw_url;
else if ( $raw_url[0] == '?' || $raw_url[0] == '&' ) // Si c'est une query
$cleansed_url = $url . $raw_url; // Le '&' : espoir
else if ( $raw_url[0] == '#' ) // Si c'est une ancre et que n'en a pas d<>j<EFBFBD> une
$cleansed_url = ( !preg_match('%#[^#]*$%', $url) ) ? $url.$raw_url : '';
else if ( preg_match('#^javascript\s*:#i', $raw_url) ) // Quand du javascript est d<>clar<61>
$cleansed_url = '';
else if ( preg_match('#^mailto\s*:\s*((?:[^i]|i)+)$#i', $raw_url, $mail) ) // Quand c'est un mail
$cleansed_url = '';
else if ( preg_match('#^[^:/]+:/#', $raw_url) ) // Quand raw_url est une url normale
$cleansed_url = $raw_url;
else if ( !preg_match('#^[^:/]+:/#', $raw_url) ) // Quand c'est tout sauf ce qu'on a dit et une url
$cleansed_url = $courant . $raw_url;
else { } // Tous les autres cas ne conviennent pas
/* Apr<70>s toutes les modif's */
// Supprime les r<>sultats qui ne commencent pas par xxx://
if (
!preg_match('#^[^:/]+:/+#', $cleansed_url) //$cleansed_url != ^http://$
//|| $raw_url[0] == '#' //$raw_url ^#
|| preg_match('#^[^:/]+:/+\s*$#', $cleansed_url) //$cleansed_url == ^http:// $
|| !is_string($cleansed_url)
)
$cleansed_url = '';
if ( !empty($cleansed_url) )
{
$cleansed_url = trim($cleansed_url);
/*** remplace /./ par / ***/
$cleansed_url = str_replace('/./', '/', $cleansed_url);
/*** remplace // par / s'ils ne commencent pas par : comme dans 'file:///' ***/
$cleansed_url = preg_replace('#(?<!\:)//#', '/', $cleansed_url); //does work!!!
/*** supprime les /../ ***/
while ( substr_count($cleansed_url, "../") ) //tant qu'il y a des /../
{
$cleansed_url = preg_replace('#/[^/]+/\.\.#', '', $cleansed_url); // But : see PHP.net online manual comment
}
}
return $cleansed_url;
}
///////////////////////////////////////////////////////////////////////////////
/********************************************************************************
* @proto (array) $urls get_urls( (string) $raw_page_content, (string) $url )
* @desc extrait les urls de $raw_page_content et les reconstruits <20> l'aide de $url
********************************************************************************/
function get_urls( $raw_page_content, $url )
{
$raw_page_content .= PHP_EOL;
$matches = $urls = array();
$regexs = array( // <20> r<><72>crire avec % ... %x et % ... %u
/* Fais chier : un jour que j'am<61>liorais *grave* mes regexs, le ventirad s'est d<>croch<63> et mon /home/ est devenu illisible...
Tout marchait *tr<74>s bien* et j'avais rajout<75> des captures ! Me disant m<>me "Heureusement que je l'ai vu <20>a, <20>a aurait pu m'<27>chapper !" */
// (string) '#regex#Z' => (int) parenthesized subpattern,
///////////////////////////////////////////////////////////////////////////////
/// Catch Abstrait Sym<79>trique
// Cas g<>n<EFBFBD>ral - ko
// '%(["\'])([^\1:/\s]+:/+[^\1]+)\1#%' => 2,
// Th<54>mes r<>currents en (x)?HTML - ok
'%(?:href|src|ur[li]|path|action|role|xmlns(?::[^:=]+)?)\s*=\s*(["\'])\s*([^\1]+)\s*\1%Ui' => 2,
/// Exceptions
// Instructions de robots.txt - ok
'%(?:Disallow|Sitemap)\s*:\s+(\S+)\s*%i' => 1,
// Image en CSS - ok - ok
'%url\s*\((["\']|)(?(1)([^\1]+)\1|([^()]+))\)%Ui' => 2,
'%url\s*(["\'])([^\1])\1%iU' => 2,
// Meta refresh - ok
'%content=(["\'])[^\1\D]*;\s*url=([^\1]+)\1%iU' => 2,
/// Catch Abstrait Asym<79>trique
// Parenth<74>ses
/// '\( , ... \)' => ,
// XML - (antislash pour php-> ?\>) . Ungreedy r<>duit l'execution - ok
'%<([^<>\s]+)(?:\s+[^>]+)?\>\s*([^>\s:/]+:/+[^<]+)\s*</\1>%' => 2,
///////////////////////////////////////////////////////////////////////////////
// '' => ,
);
foreach ( $regexs as $regex => $parenthesized_subpattern )
{
preg_match_all($regex, $raw_page_content, $reg_sult, PREG_PATTERN_ORDER); // <20>x<EFBFBD>cute les regular expressions
$matches = array_merge($matches, $reg_sult[ $parenthesized_subpattern ]); // Regroupe les r<>sultats des regex
}
$matches = array_unique($matches); // Supprime les doubles
$matches = array_values($matches); // Repart de z<>ro
$url = trim($url); // Enl<6E>ve les espaces des c<>t<EFBFBD>s
$nbUrl_=count($matches);
foreach ( $matches as $iUrl_=>$raw_url )
{
$raw_url = trim($raw_url);
if ( !empty($raw_url) ) // Supprime les cases vides
{
$res = resolve_url($raw_url, $url); // R<>soud massivement les URL trouv<75>es
/* D<>duit des urls : <20>num<75>re l'arborescence */
if ( !empty($res) ) // URL n<>ttoy<6F>es
{
preg_match('#^([^:/]+:/+[^/]+/)(.*)$#', $res, $preg_url);
if (!isset($preg_url[1])) {
echo ("$iUrl_/$nbUrl_: Type d'URL non g<>r<EFBFBD>e : '$res'".EOL);
break;
}
$str = $preg_url[1]; // = d<>but de l'url suivie d'un slash
$urls[] = $str; // Enregistre d<>j<EFBFBD> ce d<>but (utile de le mettre ici au cas ou la condition sur \2 serait false)
if ( !empty($preg_url[2]) ) // Si il y a un path (!= root<6F>| host)
{
// <20>clate le path par le slash (-> array)
$exp = explode('/', $preg_url[2]); // Compte <20> partir de 0
$k = count($exp); // Compte <20> partir de 1
for ( $i = 0; $i < $k; $i++ ) // D<>duit des urls
{
// Si on est au dernier, on ne met pas de slash <20> la fin (: cela peut-<2D>tre un fichier)
$str .= ($i == $k -1) ? $exp[$i] : $exp[$i].'/';
$urls[] = $str; // Enregistre
}
}
}
}
}
$urls = array_unique($urls); //moins doubles
$urls = array_values($urls); //orde num (des key)
if ( empty($urls[ count($urls) - 1 ]) ) // Si il y a une URL vide, elle est <20> la fin du tableau
array_pop($urls);
return $urls;
}
function crawler($q) {
if ( preg_match('#[^:/\s]://.+#', $q) && !preg_match('#[<>"\']#', $q) ) // Si l'URL semble correcte
{
$urls = array();
if ( preg_match('#^[^:/]+:/+[^/]+$#', $q) )
$q .= '/'; // Rajoute le slash si c'est le root et qu'il est sans / <20> la fin
/* if ( !empty($c) ) /* HTTP ERROR */ /*** Ne pas echo avant cette ligne ***
{
header("Content-Type: text/plain");
echo get_http_error($q); // get_http_error() devrait moins consommer que curl_web_file()
die;
}
*/
$curl = get_web_file($q); // Met l'<27>l<EFBFBD>ment du Web cibl<62> dans une variable
$urls = get_urls($curl['content'], $q); // URL trouv<75>es
natcasesort($urls); // Tri par ordre alphanum<75>rique
$nombresult = count($urls);
}
return $urls;
}
?>