340 lines
11 KiB
PHP
340 lines
11 KiB
PHP
#!/usr/bin/php -c/var/www/batch/config/php_batch_sd.ini
|
|
<?php
|
|
|
|
include_once(FWK_PATH.'common/curl.php');
|
|
include_once(FWK_PATH.'common/chiffres.php');
|
|
include_once(FWK_PATH.'common/dates.php');
|
|
// ---------------------------------------------------------------------------
// Command-line handling
//
// Recognised options:
//   -v          debug mode
//   -d          "detail" mode
//   -r=<code>   process a single RCI number (everything after "-r=")
//   -? or --    print the usage text and stop
// Any other argument starting with "-" aborts the script; arguments that do
// not start with "-" are ignored.
// ---------------------------------------------------------------------------

// Read the argument vector first so that it is defined before the usage text
// below uses $argv[0].
$argv = $_SERVER['argv'];

$modeDebug  = false;
$modeDetail = false;
$modeRCI    = false;

// NOTE(review): this usage text still describes an earlier scraper (AMF / RCI);
// the text is kept unchanged here -- confirm the wording that should be shown.
// A former "-i:XXXXX" option (resume at Insee commune code XXXXX) was already
// commented out in the original and is not part of the usage text.
$strInfoScript = 'Usage : ' . basename($argv[0]) . " <option>
Mise à jour des listes des sociétés de gestion agrées par l'AMF

Options :
-v Mode debug (Verbosité au maximum et fonctionnement sans timer ni contraintes temporelles)
-d Remplir les fiches détaillées
-r=13S00000 Traiter ce numéro de RCI

";

for ($i = 1; isset($argv[$i]); $i++) {
    // Only arguments introduced by a dash are options.
    if (substr($argv[$i], 0, 1) !== '-') {
        continue;
    }

    switch (substr($argv[$i], 1, 1)) {
        case 'v':
            $modeDebug = true;
            break;

        case 'd':
            $modeDetail = true;
            break;

        case 'r':
            // Skip the 3-character "-r=" prefix; the rest is the RCI number.
            $modeRCI = trim(substr($argv[$i], 3));
            break;

        case '-':
        case '?':
            die($strInfoScript);

        default:
            die('Option ' . $argv[$i] . " inconnue !\n");
    }
}
|
|
|
|
// ---------------------------------------------------------------------------
// Scraper configuration
// ---------------------------------------------------------------------------

// Database handle used for every insert/select/update below.
// NOTE(review): the meaning of the 'tmp' argument is defined by WDB, which is
// not visible in this file -- presumably a database or connection name.
$iDb = new WDB('tmp');

// Target table for the scraped records.
$table = 'elus';

// Prefix of each senator's detail page, and the index page listing senators
// (the index page is also sent as referer when fetching detail pages).
$urlBase = 'http://senat.fr/senateur/';
$referer = 'http://senat.fr/senateurs/sencir.html';

// URLs of the former Monaco RCI scraper, left disabled. The code located after
// the main loop's die() still refers to these variables.
/*
$urlRecherche='http://www.rci.gouv.mc/resultatRecherche';
$urlDetailS='http://www.rci.gouv.mc/ficheSGratuit.jsp?rc=';
$urlDetailPP='http://www.rci.gouv.mc/fichePPGratuit.jsp?rc=';
*/
|
|
// ---------------------------------------------------------------------------
// Extraction patterns, keyed by the field name they fill.
// Both tables are handed to html2array() together with a page body.
// ---------------------------------------------------------------------------

// Patterns applied to a senator's detail page.
// Two further patterns ('departement' and 'region', both variations of the
// 'nomPrenom' pattern) were already disabled in the original and are omitted.
$tabZonesS = array();
$tabZonesS['nomPrenom']    = '<div class="title">\s+<h1 class="title-01">(.*)</h1>\s+<h2 class="subtitle-02">Sénateur (.*)(\(.*)\)</h2>';
$tabZonesS['partiCode']    = '<li class="last">Apparenté au\s+<a href="/senateurs/(.*).html"';
$tabZonesS['partiLib']     = '<li class="last">Apparenté au\s+<a href="/senateurs/(?:.*).html" target="_self">(.*)</a></li>';
$tabZonesS['dateElection'] = '<h2 class="title title-02">Election</h2>(?:.*)<li>\s+Elu le (.*)</li>';
$tabZonesS['mail']         = '<a href="mailto:(.*)" class="link-color-01">';
$tabZonesS['dateNaiss']    = '<dt>Etat Civil</dt>\s+<dd>Né le (.*)</dd>';
$tabZonesS['profession']   = '<dt>Profession</dt>\s+<dd>(.*)</dd><dt>Place dans';

// Patterns of the former RCI "PP" record page; only referenced by the code
// located after the main loop's die().
$tabZonesPP = array();
$tabZonesPP['codeRCI']  = '<b>Numéro RCI :</b>(.*)<br>';
$tabZonesPP['etat']     = '<b>Etat :</b>(.*)<br>';
$tabZonesPP['forme']    = '<b>Structure :</b>(.*)<br>';
$tabZonesPP['rs']       = '<b>Commerce exercé sous le nom :</b>(.*)<br>';
$tabZonesPP['formePP']  = '<h1 class="titre2">Etat-civil du déclarant</h1>(?:.*)<h1 class="titre3">(.*)</h1><br>';
$tabZonesPP['nom']      = '<b>Nom :</b>(.*)<br>';
$tabZonesPP['prenom']   = '<b>Prénom :</b>(.*)<br>';
$tabZonesPP['nomJF']    = '<b>Nom de jeune fille :</b>(.*)<br>';
$tabZonesPP['activite'] = '<b>Activité de l\'établissement :</b>(.*)<br>(?:\s+)</div>';
$tabZonesPP['etab']     = '</DIV>(?:.*)<h1 class="titre2">(.*)</h1>(?:.*)<div';
$tabZonesPP['etabType'] = '<h1 class="titre3">(.*)</h1><br>';
$tabZonesPP['etabEns']  = '<h1 class="titre4">(.*)</h1>';
// The four address patterns differ only in which <br>-separated line is captured.
$tabZonesPP['ensAdr1']  = '<td><b>Adresse :</b></td>(?:.*)<td>(.*)<br>(?:.*)<br>(?:.*)<br>(?:.*)</td>';
$tabZonesPP['ensAdr2']  = '<td><b>Adresse :</b></td>(?:.*)<td>(?:.*)<br>(.*)<br>(?:.*)<br>(?:.*)</td>';
$tabZonesPP['ensAdr3']  = '<td><b>Adresse :</b></td>(?:.*)<td>(?:.*)<br>(?:.*)<br>(.*)<br>(?:.*)</td>';
$tabZonesPP['ensAdr4']  = '<td><b>Adresse :</b></td>(?:.*)<td>(?:.*)<br>(?:.*)<br>(?:.*)<br>(.*)</td>';
|
|
|
|
// ---------------------------------------------------------------------------
// Main job: walk the senators index page department by department, fetch each
// senator's detail page, extract the fields described by $tabZonesS and insert
// one row per senator into $table. The script stops (die) once this is done.
// ---------------------------------------------------------------------------
echo date('Y/m/d - H:i:s') . " - Début de la mise à jour des entreprises '$table'..." . EOL;

// Fetch the index page.
$page = getUrl($referer, '', '', '', false);
$body = $page['body'];

// Split the index by department: capture 1 is the department heading,
// capture 2 the HTML of that department's list.
if (preg_match_all('/title\-05">(.*)<\/h2>(?:.*)<ul class="list\-type\-03">(.*)<\/ul>/Uis', $body, $matches)) {
    foreach ($matches[1] as $iDept => $libDept) {
        $listeSenateursHtm = $matches[2][$iDept];

        // Split the department list by senator: capture 1 is the page slug,
        // capture 2 the link text.
        if (!preg_match_all('/<li><A href="\/senateur\/(.*)\.html">(.*)<\/A><\/li>/Uim', $listeSenateursHtm, $matches2)) {
            continue;
        }

        // A dedicated index is used so the department index is not overwritten.
        foreach ($matches2[1] as $iSen => $urlDetail) {
            $url       = $urlBase . $urlDetail . '.html';
            $nomPrenom = $matches2[2][$iSen];

            // The link text is split on spaces: first word, then second word.
            $tabTmp = explode(' ', $nomPrenom);
            $nom    = $tabTmp[0];
            // A single-word link text has no second element.
            $prenom = isset($tabTmp[1]) ? $tabTmp[1] : '';

            echo "$iSen\t$libDept\t$nomPrenom\t$nom\t$prenom\t$url" . EOL;

            // Stop on link texts of more than two words: the split above cannot
            // tell which words belong to which field. The original tested
            // count($tabTmp[1]) -- a string -- so this guard never worked
            // (and raises a TypeError on PHP 8).
            if (count($tabTmp) > 2) {
                die('Cas non prévu !!!' . EOL);
            }

            // Fetch and parse the detail page.
            $page = getUrl($url, '', '', $referer, false);
            $body = $page['body'];

            $tabUpdate = html2array($body, $tabZonesS);
            $tabUpdate['mandat']     = 'senateur';
            $tabUpdate['nom']        = $nom;
            $tabUpdate['prenom']     = $prenom;
            $tabUpdate['libDept']    = $libDept;
            $tabUpdate['url']        = $url;
            $tabUpdate['dateInsert'] = date('YmdHis');
            // Convert both scraped dates from 'd M Y' to 'Ymd'.
            // NOTE(review): assumes html2array() always returns these two keys -- confirm.
            $tabUpdate['dateElection'] = WDate::dateT('d M Y', 'Ymd', $tabUpdate['dateElection']);
            $tabUpdate['dateNaiss']    = WDate::dateT('d M Y', 'Ymd', $tabUpdate['dateNaiss']);

            // NOTE(review): errors are silenced and the result is ignored, so a
            // failed insert goes unnoticed -- confirm this is intended.
            @$iDb->insert($table, $tabUpdate);
            print_r($tabUpdate);

            // Pause between requests; shorter in debug mode, as the other loops
            // of this file do.
            if ($modeDebug) {
                randsleep(1, 2);
            } else {
                randsleep(7, 21);
            }
        }
    }
}

// End of the job: nothing below this line is executed.
die();
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
// Legacy "listing" phase of the former RCI scraper.
// Unreachable: the script calls die() before this point.
//
// Without -r: works through a persisted, shuffled list of the three-letter
// search keys AAA..ZZZ, one search request per key, inserting every result row
// into $table and removing the key from the persisted list once processed.
// With -r: validates the given number and issues a single search for it.
//
// NOTE(review): $urlRecherche is only defined in a commented-out assignment,
// so it is undefined if this code is ever re-enabled.
// ---------------------------------------------------------------------------
if (!$modeDetail || $modeRCI) {

    if ($modeRCI) {
        // Expected layout: 2 characters (year), one letter S or P, then digits.
        $an     = substr($modeRCI, 0, 2);
        $lettre = strtoupper(substr($modeRCI, 2, 1));
        if (!($lettre == 'S' || $lettre == 'P')) {
            die("Erreur : N° RCI '$modeRCI' incorrect !");
        }

        $numRC = substr($modeRCI, 3, 5);
        if ($numRC < 1 || $numRC > 99999) {
            die("Erreur : N° RCI '$modeRCI' incorrect !");
        }

        $tabLet = array(0 => $modeRCI);
        $nbLet  = 2;
    } else {
        // Resume from the persisted key list when there is one.
        $tabSerialised = @file_get_contents(REP_TEMP . $table);

        if ($tabSerialised) {
            $tabLet = unserialize($tabSerialised);
        } else {
            // Only the key generation depends on this test; the shuffle and the
            // save below always run.
            if (!$nbLet) {
                $letIni = false;
                for ($l1 = ord('A'); $l1 <= ord('Z'); $l1++) {
                    $L1 = chr($l1);
                    for ($l2 = ord('A'); $l2 <= ord('Z'); $l2++) {
                        $L2 = chr($l2);
                        for ($l3 = ord('A'); $l3 <= ord('Z'); $l3++) {
                            $L3       = chr($l3);
                            $let      = '' . $L1 . $L2 . $L3;
                            $tabLet[] = $let;
                        }
                    }
                }
            }

            shuffle($tabLet);
            file_put_contents(REP_TEMP . $table, serialize($tabLet));
        }

        $nbLet = count($tabLet);

        echo date('Y/m/d - H:i:s') . " - Il reste $nbLet requêtes à traiter..." . EOL;
    }

    $nbDone = 0;
    foreach ($tabLet as $j => $let) {
        $nbDone++;

        // One search URL per key, or the direct lookup for a single number.
        if ($modeRCI) {
            $url = $urlRecherche . "Gratuit.jsp?mille=$an&fj=$lettre&rc=$numRC";
        } else {
            $url = $urlRecherche . 'RSEGratuit.jsp?rse=' . $let;
        }

        $page = getUrl($url, '', '', '', false);
        $body = $page['body'];

        echo date('Y/m/d - H:i:s') . " - $nbDone/$nbLet : RC Monaco '$let' ...";

        // One match per result row; captures 1, 2, 4, 5, 6 and 7 are stored.
        $found = preg_match_all('/<a href="\.\/fiche(PP|S)Gratuit\.jsp\?rc=(.*)">(.*)<\/a>(?:.*)<td>(.*)<\/td>(?:.*)<td>(.*)<\/td>(?:.*)<td>(.*)<\/td>(?:.*)<td>(.*)<\/td>(?:.*)<\/tr>/Uis', $body, $matches);

        if (!$found) {
            echo "Pas d'entreprise";
        } else {
            foreach ($matches[2] as $idx => $numRCI) {
                $tabEntrep = array(
                    'codeRCI'    => trim($numRCI),
                    'PmPP'       => trim($matches[1][$idx]),
                    'forme'      => trim($matches[4][$idx]),
                    'rs'         => trim($matches[5][$idx]),
                    'etabEns'    => trim($matches[6][$idx]),
                    'etat'       => trim($matches[7][$idx]),
                    'dateInsert' => date('YmdHis'),
                );
                @$iDb->insert($table, $tabEntrep);
            }

            // Number of rows handled (last index + 1).
            $i = count($matches[2]);
            echo "$i entreprise(s)";
        }
        echo EOL;

        if (!$modeRCI) {
            // Persist the remaining keys so an interrupted run can resume.
            unset($tabLet[$j]);
            file_put_contents(REP_TEMP . $table, serialize(array_unique($tabLet)));

            if ($modeDebug) {
                randsleep(1, 2);
            } else {
                randsleep(60, 120);
                // Stop for the day once the hour is past 19.
                if (date('H') > 19) {
                    die();
                }
            }
        }
    }
}
|
|
|
|
// ---------------------------------------------------------------------------
// Legacy "detail" phase of the former RCI scraper.
// Unreachable: the script calls die() before this point.
//
// Selects the rows to complete (one row with -r, otherwise rows whose
// 'activite' is NULL or whose 'etat' is empty), fetches each record page,
// extracts the fields with html2array() and updates the row by id.
//
// NOTE(review): $urlDetailS and $urlDetailPP are only defined in commented-out
// assignments, so they are undefined if this code is ever re-enabled.
// ---------------------------------------------------------------------------
if ($modeDetail || $modeRCI) {
    if ($modeRCI) {
        // NOTE(review): the command-line value is interpolated into the WHERE
        // clause as is; use WDB's escaping or binding mechanism if it has one.
        $ret = $iDb->select($table, 'id,codeRCI,PmPP', "codeRCI='$modeRCI'", false);
    } else {
        $ret = $iDb->select($table, 'id,codeRCI,PmPP', "activite IS NULL OR etat='' /*AND PmPP='PP'*/ ORDER BY dateInsert DESC", false);
    }
    $nbLet = count($ret);

    // The counter carries on from the listing phase when that phase ran;
    // otherwise it starts at zero (the original relied on @ to hide this).
    if (!isset($nbDone)) {
        $nbDone = 0;
    }

    foreach ($ret as $tFiche) {
        $nbDone++;

        $id  = $tFiche['id'];
        $rci = $tFiche['codeRCI'];
        $ppS = $tFiche['PmPP'];

        echo date('Y/m/d - H:i:s') . " - $nbDone/$nbLet : RCI Monaco '$rci' ($id) - '";

        // The record type selects both the page URL and the pattern table.
        if ($ppS == 'S') {
            $urlFiche = $urlDetailS . $rci;
            $tabZones = $tabZonesS;
        } elseif ($ppS == 'PP') {
            $urlFiche = $urlDetailPP . $rci;
            $tabZones = $tabZonesPP;
        } else {
            die("Type '$ppS' inconnu !" . EOL);
        }

        $page = getUrl($urlFiche, '', '', $referer, false);
        $body = $page['body'];

        $tabUpdate = html2array($body, $tabZones);

        // Blank the 4th address line when it holds the site's notice text.
        // The test uses !== false so that a notice found at offset 0 is also
        // detected, and isset() because not every pattern table has this key.
        if (isset($tabUpdate['ensAdr4'])
            && strpos($tabUpdate['ensAdr4'], 'Pour de plus amples informations') !== false) {
            $tabUpdate['ensAdr4'] = '';
        }

        $iDb->update($table, $tabUpdate, "id=$id");

        echo (isset($tabUpdate['rs']) ? $tabUpdate['rs'] : '') . "'" . EOL;

        if (!$modeRCI) {
            if ($modeDebug) {
                randsleep(1, 2);
            } else {
                randsleep(60, 120);
                // Stop for the day once the hour is past 19.
                if (date('H') > 19) {
                    die();
                }
            }
        }
    }
}

die();
|
|
|
|
// ---------------------------------------------------------------------------
// Leftovers of earlier versions of this script.
// Unreachable: the script calls die() before this point, and each fragment
// below is itself cut off by a further die().
// NOTE(review): several variables read here ($iDeb, $tabZones, $siren,
// $nbVides) are not assigned anywhere in the visible code, and the mysql_*
// functions no longer exist in PHP 7+.
// ---------------------------------------------------------------------------

// Fragment 1: reshuffle and persist the key list.
shuffle($tabLet);
file_put_contents(REP_TEMP . $table, serialize($tabLet));
die();

// Fragment 2: dump the key list.
echo count($tabLet);
print_r($tabLet);
die();

// Fragment 3: dump the zones extracted from the current page body.
$tabHtml = html2array($body, $tabZones);
print_r($tabHtml);
die();

// Fragment 4: list every 'agrementNum' of the table.
$ret = $iDb->select($table, 'agrementNum', '1', false);
foreach ($ret as $tAgrNum) {
    $agrNum = $tAgrNum['agrementNum'];
    echo date('Y/m/d - H:i:s') . " - Num Agrément : $agrNum" . EOL;
}

// Fragment 5: open-ended loop over ids starting at $iDeb. It ends through one
// of the die() calls in its body.
for ($i = $iDeb; ; $i++) {
    $tabInsert = array('id' => $i);

    $url = $urlBase . "?NumAgr=$agrNum&DateDeb=&DateFin=&lstTypeDec=0&NomSOc=&action=new&varvalidform=on&hidRagCode=&CodeAMF=&btnvalid.x=13&btnvalid.y=15";
    $page = getUrl($url, '', '', '', false);
    $referer = $url;
    $body = $page['body'];

    // Debug stop: dumps the fetched page and ends the script.
    print_r($page);
    die();

    $tabHtml = html2array($body, $tabZones);

    foreach ($tabHtml as $zone => $data) {
        // Scalar zones become columns of the main row.
        if (!is_array($data)) {
            $tabInsert[$zone] = $data;
            continue;
        }

        // Array zones go to the table named $table.$zone, one row per entry.
        foreach ($data as $j => $tabInsert2) {
            $tabInsert2['id']         = $i;
            $tabInsert2['num']        = $j;
            $tabInsert2['dateInsert'] = date('YmdHis');

            if ($iDb->insert($table . $zone, $tabInsert2, false)) {
                continue;
            }

            // Insert failed: MySQL error 1062 is turned into an update of the
            // existing row; any other error stops the script.
            if (mysql_errno() == 1062) {
                unset($tabInsert2['dateInsert']);
                $iDb->update($table . $zone, $tabInsert2, "id=$i AND num=$j", false);
            } else {
                print_r($tabInsert2);
                die(mysql_error());
            }
        }
    }

    // Rows with fewer than 10 fields or no 'raiSoc' count as empty; the 20th
    // consecutive empty one ends the script.
    if (count($tabInsert) < 10 || $tabInsert['raiSoc'] == '') {
        $nbVides++;
        if ($nbVides < 20) {
            continue;
        }

        echo date('Y/m/d - H:i:s') . " - $i : Il semble que le dernier numéro '$table' attribué soit le " . ($i - $nbVides) . EOL;
        die();
    }
    $nbVides = 0;

    unset($tabInsert['qualifs']);

    if (isset($tabInsert['urlImg']) && $tabInsert['urlImg'] != '') {
        $urlLogo = 'http://www.qualifelec.fr/';

        // Debug stop: prints the image URL and ends the script.
        die($tabInsert['urlImg']);

        if (substr($urlLogo, 0, 32) == 'http://www.annuaire.com/uploads/') {
            $extension = substr(strrchr($urlLogo, '.'), 1);

            // Download the logo only when it is not stored yet.
            if (!file_exists("/home/data/logos/$siren.$extension")) {
                $referer = '';
                $tDeb    = microtime(true);
                $page    = getUrl($urlLogo, '', '', $referer, false, '', '', 60);
                $duree   = round(microtime(true) - $tDeb, 3);
                $body    = $page['body'];
                $taille  = round(strlen($body) / 1024, 1);
                file_put_contents("/home/data/logos/$siren.$extension", $body);
            }
        }
    }
    unset($tabInsert['urlImg']);

    $tabInsert['dateInsert'] = date('YmdHis');

    // Same insert-then-update-on-error-1062 handling as for the zone tables.
    if (!$iDb->insert($table, $tabInsert, false)) {
        if (mysql_errno() == 1062) {
            unset($tabInsert['dateInsert']);
            $iDb->update($table, $tabInsert, "id=$i", false);
        } else {
            print_r($tabInsert);
            die(mysql_error());
        }
    }

    $nb  = count($tabInsert);
    // $tabInsert2 is whatever the last inner iteration left behind (possibly unset).
    $nb2 = @count($tabInsert2);
    echo date('Y/m/d H:i:s') . " - $i : $nb zones et $nb2 qualifications" . EOL;

    if ($modeDebug) {
        randsleep(1, 2);
    } else {
        randsleep(7, 21);
    }
}

die();
|
|
|
|
?>
|