#!/usr/bin/php -c/var/www/batch/config/php_batch_sd.ini
<?php
// Batch script: reads SIREN/NIC pairs from a CSV file, looks each one up through MInsee
// and appends "Siret;tel;fax" rows to an output CSV, then stops at the first top-level die().
// The top-level code after that die() is scratch/experimental and is not executed.
include_once ( FWK_PATH . 'common/chiffres.php' );
include_once ( FWK_PATH . 'common/dates.php' );
include_once ( INCLUDE_PATH . 'insee/classMInsee.php' );
include_once ( INCLUDE_PATH . 'partenaires/classMGreffes.php' );
include_once ( INCLUDE_PATH . 'partenaires/classMTel.php' );
include_once ( INCLUDE_PATH . 'partenaires/classMMap.php' );
// Service objects used by the rest of the script (classes come from the includes above).
$iGreffe = new MGreffes();
$iInsee = new MInsee();
$iTel = new MTel();
/*
$tabSiren = array(
    "384814307",
    "500565528",
    "493650949",
); */
// Input file: one establishment per line; the export loop reads the first two
// comma-separated columns as SIREN and NIC.
$tabLignes = file('/root/sql/test-siret-pjms.csv');
if ($tabLignes === false) {
    // file() returns false when the file cannot be read; stop here instead of
    // feeding false to count() and foreach further down.
    fwrite(STDERR, 'Erreur : lecture impossible de /root/sql/test-siret-pjms.csv' . "\n");
    exit(1);
}
$nb = count($tabLignes);
2013-06-19 08:24:49 +00:00
// Print a semicolon-separated column header on stdout.
// The header names the full set of export columns, whereas the active loop below only
// emits the Siret, tel and fax values (the wider per-column export is commented out).
// NOTE(review): the accented characters of this literal are not valid in the file's
// current encoding as displayed — confirm the intended encoding before editing it.
echo 'Siret;Si<53> ge;Raison Sociale;Raison Sociale (suite);Enseigne;N<> et Nom de Rue;Compl<70> ment adresse;CP;Ville;Code d<> part INSEE;Code Commune INSEE;Cat<61> gorie juridique;Libell<6C> CJ;Code Naf;Libell<6C> Naf;Latitude;Longitude;Pr<50> cision;lambert X;lambert Y;Tranche effectif <20> tablissement;Tranche effectif entreprise;effectif bilan;date de cr<63> ation <20> tablissement;date de cr<63> ation entreprise;nb <20> tablissements actifs;Capital;Capital Devise;Ch Aff;tel;fax' . EOL ;
2012-10-16 07:44:31 +00:00
// Export loop: for every "siren,nic" line of the input file, fetch the establishment
// identity through MInsee and append one "Siret;tel;fax" row to the output CSV.
// The same row is echoed on stdout. The script stops once the input is exhausted.
//
// The wider export announced by the header (address, legal form, NAF, coordinates,
// Lambert projection, head counts, capital...) was commented out in the original
// script and is not reproduced here.
$fp = fopen('/root/sql/test-pjms-telfax.csv', 'a');
if ($fp === false) {
    // Without a writable output file every fwrite() below would fail silently.
    fwrite(STDERR, 'Erreur : ouverture impossible de /root/sql/test-pjms-telfax.csv' . "\n");
    exit(1);
}

// Separators removed from raw phone/fax numbers before the digits are regrouped by two.
$tabSeparateurs = array('.' => '', ' ' => '', '-' => '', '/' => '', ',' => '');

foreach ($tabLignes as $i => $ligne) {
    //if ($i<397672) continue;
    // file() keeps the line terminator: trim it so the last column is a clean number.
    $tabCol = explode(',', trim($ligne));
    $siren = (int) $tabCol[0];
    // A line without a second column is treated as NIC 0 instead of raising a notice.
    $nic = isset($tabCol[1]) ? (int) $tabCol[1] : 0;
    if ($siren > 0) {
        // Errors raised by the lookup are deliberately silenced (best-effort export).
        $tabId = @$iInsee->getIdentiteEntreprise($siren, $nic);

        $siret = isset($tabId['Siret']) ? $tabId['Siret'] : '';
        $telBrut = isset($tabId['Tel']) ? $tabId['Tel'] : '';
        $faxBrut = isset($tabId['Fax']) ? $tabId['Fax'] : '';

        // chunk_split() also appends the separator after the last pair ("01.02.").
        $tel = chunk_split(strtr($telBrut, $tabSeparateurs), 2, '.');
        $fax = chunk_split(strtr($faxBrut, $tabSeparateurs), 2, '.');

        $ligneCsv = $siret . ";$tel;$fax" . EOL;
        fwrite($fp, $ligneCsv);
        echo $ligneCsv;

        // Short pause between two lookups.
        usleep(100);
    }
}
fclose($fp);
die();
// Scratch code (placed after a top-level die()): checks a shuffled list of SIREN numbers
// with MInsee::valideSiren() and dumps the MGreffes identity of every valid one.
$tabSiren = array(
    " 999171994 ",
    " 999979248 ",
);
shuffle($tabSiren);
foreach ($tabSiren as $i => $siren) {
    if (!$iInsee->valideSiren($siren)) {
        $strValide = 'invalide !';
    } else {
        $strValide = 'VALIDE...';
        print_r($iGreffe->getIdentite($siren));
        // Random pause between two registry lookups.
        randsleep(7, 21);
    }
    echo " $i : $siren $strValide " . EOL;
    //findSiteWeb($siren);
    //die();
}
die();
/*
$tabAdresses = file ( './adresse.txt' );
$fp = fopen ( './adresses.csv' , 'w' );
foreach ( $tabAdresses as $ligne ) {
$adr = $iInsee -> structureVoie ( $ligne );
$adrComp =@ trim ( $adr [ 'adrComp0' ] . ' ' . $adr [ 'adrComp1' ] . ' ' . $adr [ 'adrComp2' ]);
@ fwrite ( $fp , '' . $adr [ 'num' ] . ';' . $adr [ 'indRep' ] . ';' . $adr [ 'typeVoie' ] . ';' . $adr [ 'libVoie' ] . ';' . $adr [ 'cp' ] . ';' . $adr [ 'ville' ] . ';' . $adrComp . EOL );
// fgets(STDIN);
}
fclose ( $fp ); */
// Scratch code: extracts, for each NAF code found in a NAF rev.2 / CPF 2008 text file,
// the list of associated CPF product codes and prints one "index : NAF <tab> codes" line.
// Not reached while the earlier top-level die() calls remain in place.
// The file content is converted with utf8_decode() before matching.
$strNafCpf = utf8_decode ( file_get_contents ( '/root/NAFrev2CPF2008.txt' ));
2013-06-19 08:24:49 +00:00
// Capture groups: 1 = NAF code (two digits, dot, two digits, one capital letter),
// 2 = text starting at "Cette sous-classe comprend", 3 = the "Produits associ..." paragraph
// up to the next blank line. Modifiers: U (ungreedy), i (case-insensitive), s (dot matches newline).
preg_match_all ( '/([0-9]{2,2}\.[0-9]{2,2}[A-Z]{1,1})(?:.*)(Cette sous-classe comprend \:.*)(Produits associ<63> s \:.*\n\n)/Uis' , $strNafCpf , $matches );
2012-10-16 07:44:31 +00:00
//(.*)
$tabNaf = $matches [ 1 ];
$tabTxt = $matches [ 2 ];
$tabCpf = $matches [ 3 ];
//print_r($matches[4]);
foreach ( $tabNaf as $i => $naf ) {
2013-06-19 08:24:49 +00:00
// Drop the paragraph label and line breaks, upper-case the rest, translate letters and
// punctuation with strtr(), then collapse runs of spaces and trim.
// NOTE(review): as displayed, the strtr() replacement string is a single space; with
// from/to strings of different lengths strtr() ignores the extra characters of the longer
// one, so only the first character would be translated. The original probably used a run
// of spaces of the same length as the character list — confirm against the real file.
$strCpf = trim ( preg_replace ( '/ +/' , ' ' , strtr ( strtoupper ( strtr ( $tabCpf [ $i ], array ( 'Produits associ<63> s :' => '' , " \r " => ' ' , " \n " => ' ' ))), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ.<2E> <> -' , ' ' )));
2012-10-16 07:44:31 +00:00
// Split on commas and remove the remaining spaces inside each code.
$tabCpf2 = explode ( ',' , $strCpf );
$tabCpf3 = array ();
foreach ( $tabCpf2 as $cpf )
$tabCpf3 [] = strtr ( $cpf , array ( ' ' => '' ));
echo " $i : " . $tabNaf [ $i ] . " \t " . implode ( ',' , $tabCpf3 ) . EOL ;
}
die ();
/* 36.00 Z Captage , traitement et distribution d ' eau
Cette sous - classe comprend aussi :
- l 'exploitation de canaux d' irrigation
2013-06-19 08:24:49 +00:00
- l 'exploitation d' appareils d ' irrigation <EFBFBD> des fins agricoles ( cf . 01.61 Z )
2012-10-16 07:44:31 +00:00
2013-06-19 08:24:49 +00:00
- le traitement des eaux us<EFBFBD> es <EFBFBD> des fins de pr<EFBFBD> vention de la pollution ( cf . 37.00 Z )
2012-10-16 07:44:31 +00:00
- le transport ( sur de longues distances ) d ' eau par conduite ( cf . 49.50 Z )
Cette sous - classe comprend :
2013-06-19 08:24:49 +00:00
- le captage d ' eau <EFBFBD> partir de rivi<EFBFBD> res , de lacs et de puits , etc .
2012-10-16 07:44:31 +00:00
- la collecte d ' eau de pluie
- le traitement de l 'eau aux fins de la distribution d' eau
- le traitement de l ' eau pour des usages industriels ou autres
- le dessalement de l 'eau de mer ou d' eaux souterraines , pour autant que la production d ' eau
2013-06-19 08:24:49 +00:00
constitue l ' activit<EFBFBD> principale
2012-10-16 07:44:31 +00:00
- la distribution de l ' eau par conduites , camions ou autres moyens de transport
36.00 . 11 , 36.00 . 12 , 36.00 . 20 , 36.00 . 30 */
// Scratch code: looks up one company on the CCI directory (www.aef.cci.fr) through its
// search form. Not reached while the earlier top-level die() calls remain in place.
// getUrl() is defined outside this file; it is used here as returning an array with
// 'code', 'header' and 'body' entries.
//
// Test identifiers: each assignment overrides the previous one, only the last is used.
$siret = '51462335400016' ;
$siret = '55214450300018' ;
$siret = '51462323000018' ; // Aucun siret sur CCI
$siret = '064410863' ;
$siret = '56850374200013' ; // Alsaceeco
$siret = '49496793800031' ;
$siret = '49205135400024' ;
$siret = '91732031900010' ;
// Step A: load the home page to obtain the session cookies.
$url = 'http://www.aef.cci.fr/accueil' ;
$page = getUrl ( $url );
if ( $page [ 'code' ] <> 200 ) die ( 'A Erreur HTTP ' . $page [ 'code' ] . " sur $url " );
$cookies = $page [ 'header' ][ 'Set-Cookie' ];
$referer = $url ;
// Step B: post the search form with only the identifier filled in; a 302 redirect is expected.
$url = 'http://www.aef.cci.fr/accueil/accueil/validerFormulaire' ;
$post = array ( 'label' => '' ,
'identifiant' => $siret ,
'nom' => '' ,
'dep' => '' ,
);
usleep ( rand ( 300 , 1000 ));
$page = getUrl ( $url , $cookies , $post , $referer );
$referer = $url ;
//$cookies=$page['header']['Set-Cookie'];
if ( $page [ 'code' ] <> 302 ) die ( 'B Erreur HTTP ' . $page [ 'code' ] . " sur $url " );
// Step C: follow the redirect to the result list, reusing the cookies from step A.
$url = $page [ 'header' ][ 'Location' ];
$page = getUrl ( $url , $cookies , '' , $referer );
$referer = trim ( $url );
//$pageJS=getUrl('http://www.aef.cci.fr/scripts/Marqueur.js', $cookies, '', $referer);
//die(print_r($pageJS));
// Collect every "ficheEntreprise?siret=..." link of the result list: group 1 is the SIRET,
// group 2 the link text.
if ( ! preg_match_all ( '/<strong><a href="ficheEntreprise\?siret=(.*)">(.*)<\/a>(?:.*)<\/strong>/Uis' , $page [ 'body' ], $matches ))
die ( " C Erreur : Aucun siret $siret sur $url " );
else {
// Exactly one result: take its SIRET. Otherwise dump the matches and stop so that the
// establishment can be chosen by hand.
// NOTE(review): the length test reads $siren, not $siret. $siren is not assigned in this
// section (it is left over from the loops earlier in the script) — confirm which variable
// and which condition were intended before changing it.
if ( strlen ( $siren ) < 14 && count ( $matches [ 1 ]) == 1 )
$siret = $matches [ 1 ][ 0 ];
else {
echo " $siret $url $cookies : Choisir le bon nic ! " . EOL ;
print_r ( $matches );
die ();
}
}
// http://www.aef.cci.fr/accueil/listeEntreprises/ficheEntreprise?siret=
// Scratch code: fetches the CCI company page for $siret and fills $tabRet with the fields
// found in its HTML, then prints the array and stops.
// getTextInHtml() and htm2txt() are defined outside this file; presumably the former
// returns the text located between the given markers and the latter strips the markup —
// TODO confirm against their definitions. The @ operator silences errors raised by
// getTextInHtml(). Fields wrapped in utf8_decode() are converted from UTF-8 to ISO-8859-1.
$url = 'http://www.aef.cci.fr/accueil/listeEntreprises/ficheEntreprise?siret=' . $siret ;
usleep ( rand ( 200 , 900 ));
// Remove the "Path=/" attribute from the cookie string before sending it back.
$strCookies = str_replace ( " Path=/ \r \n " , '' , $cookies );
echo " $siret $url ( $referer ) $strCookies $cookies " . EOL ;
$page = getUrl ( $url , $strCookies , '' , $referer );
$body = $page [ 'body' ];
$tabRet = array ();
// Link to the local CCI: split on double quotes; element 1 is kept as the URL and the
// last element, with '>' removed, as the CCI name.
$lienCCI = trim ( htm2txt ( @ getTextInHtml ( $body , '<p class="ficheCCI">' , 'href' , '</p>' )));
$tmp = explode ( '"' , $lienCCI );
$tabRet [ 'nomCCI' ] = str_replace ( '>' , '' , end ( $tmp ));
$tabRet [ 'urlCCI' ] = $tmp [ 1 ];
// Establishment fields, each read from the <dd> that follows the given <dt> label.
$tabRet [ 'siret' ] = trim ( htm2txt ( @ getTextInHtml ( $body , '<dt>SIRET</dt>' , '<dd>' , '</dd>' )));
$tabRet [ 'enseigne' ] = utf8_decode ( trim ( htm2txt ( @ getTextInHtml ( $body , '<dt>Enseigne</dt>' , '<dd>' , '</dd>' ))));
$tabRet [ 'statut' ] = utf8_decode ( trim ( htm2txt ( @ getTextInHtml ( $body , '<dt>Statut</dt>' , '<dd>' , '</dd>' ))));
2013-06-19 08:24:49 +00:00
$tabRet [ 'categorie' ] = utf8_decode ( trim ( htm2txt ( @ getTextInHtml ( $body , '<dt>Catégorie</dt>' , '<dd>' , '</dd>' ))));
2012-10-16 07:44:31 +00:00
$tabRet [ 'voie' ] = utf8_decode ( trim ( htm2txt ( @ getTextInHtml ( $body , '<dt>Voie </dt>' , '<dd>' , '</dd>' ))));
$tabRet [ 'bp' ] = trim ( htm2txt ( @ getTextInHtml ( $body , ' <dt>Boite postale </dt>' , '<dd>' , '</dd>' )));
$tabRet [ 'cp' ] = trim ( htm2txt ( @ getTextInHtml ( $body , '<dt>Code postal </dt>' , '<dd>' , '</dd>' )));
$tabRet [ 'ville' ] = utf8_decode ( trim ( htm2txt ( @ getTextInHtml ( $body , '<dt>Ville</dt>' , '<dd>' , '</dd>' ))));
$tabRet [ 'pays' ] = trim ( htm2txt ( @ getTextInHtml ( $body , '<dt>Pays </dt>' , '<dd>' , '</dd>' )));
2013-06-19 08:24:49 +00:00
$tabRet [ 'tel' ] = trim ( htm2txt ( @ getTextInHtml ( $body , '<dt>Téléphone </dt>' , '<dd>' , '</dd>' )));
$tabRet [ 'fax' ] = trim ( htm2txt ( @ getTextInHtml ( $body , '<dt>Télécopie </dt>' , '<dd>' , '</dd>' )));
2012-10-16 07:44:31 +00:00
$tabRet [ 'web' ] = trim ( htm2txt ( @ getTextInHtml ( $body , 'Site internet' , 'href' , '</dd>' )));
2013-06-19 08:24:49 +00:00
$tabRet [ 'dateDebut' ] = trim ( htm2txt ( @ getTextInHtml ( $body , 'Date de début d\'activité' , '<dd>' , '</dd>' )));
2012-10-16 07:44:31 +00:00
$tabRet [ 'naf' ] = trim ( htm2txt ( @ getTextInHtml ( $body , '<dt>Code NAF 2008' , '<dd>' , '</dd>' )));
2013-06-19 08:24:49 +00:00
$tabRet [ 'activite' ] = utf8_decode ( trim ( htm2txt ( @ getTextInHtml ( $body , 'Activité en clair' , '<dd>' , '</dd>' ))));
2012-10-16 07:44:31 +00:00
// Company-level fields.
$tabRet [ 'siren' ] = trim ( htm2txt ( @ getTextInHtml ( $body , '<dt>SIREN</dt>' , '<dd>' , '</dd>' )));
$tabRet [ 'rs' ] = utf8_decode ( trim ( htm2txt ( @ getTextInHtml ( $body , ' <dt>Raison sociale</dt>' , '<dd>' , '</dd>' ))));
2013-06-19 08:24:49 +00:00
$tabRet [ 'denom' ] = utf8_decode ( trim ( htm2txt ( @ getTextInHtml ( $body , '<dt>Dénomination </dt>' , '<dd>' , '</dd>' ))));
2012-10-16 07:44:31 +00:00
$tabRet [ 'fj' ] = utf8_decode ( trim ( htm2txt ( @ getTextInHtml ( $body , 'Forme juridique' , '<dd>' , '</dd>' ))));
//$tabRet['lien']=$lienCCI;
// Management blocks: the whole <dl class="fiche"> list following each heading is kept as text.
$tabRet [ 'diropp' ] = utf8_decode ( trim ( htm2txt ( @ getTextInHtml ( $body , 'quipe dirigeante</h3>' , '<dl class="fiche">' , '</div>' ))));
$tabRet [ 'dirleg' ] = utf8_decode ( trim ( htm2txt ( @ getTextInHtml ( $body , '<h4>Dirigeants</h4>' , '<dl class="fiche">' , '</div>' ))));
// Share capital: both values are located from the same "Capital social (" label; the first
// reads up to '</dt>' (stored as capitalDev), the second reads the following <dd> (capitalMt).
$tabRet [ 'capitalDev' ] = trim ( htm2txt ( @ getTextInHtml ( $body , 'Capital social (' , ' (' , '</dt>' )));
$tabRet [ 'capitalMt' ] = trim ( htm2txt ( @ getTextInHtml ( $body , 'Capital social (' , '<dd>' , '</dd>' )));
print_r ( $tabRet );
//print_r($page);
die ();
2013-06-19 08:24:49 +00:00
/* 11 : 26 : 17.340 [ 239 ms ][ total 816 ms ] <EFBFBD> tat: 200 [ OK ]
2012-10-16 07:44:31 +00:00
GET http :// www . aef . cci . fr / accueil / listeEntreprises / ficheEntreprise ? siret = 51462335400016 Indicateurs chargement [ LOAD_DOCUMENT_URI LOAD_INITIAL_DOCUMENT_URI ] Taille contenu [ 14043 ] Type Mime [ text / html ]
2013-06-19 08:24:49 +00:00
En - t<EFBFBD> tes requ<EFBFBD> te :
2012-10-16 07:44:31 +00:00
Host [ www . aef . cci . fr ]
User - Agent [ Mozilla / 5.0 ( Windows ; U ; Windows NT 5.1 ; fr ; rv : 1.9 . 1.3 ) Gecko / 20090824 Firefox / 3.5 . 3 ( . NET CLR 3.5 . 30729 ) FirePHP / 0.3 ]
Accept - Language [ fr , fr - fr ; q = 0.8 , en - us ; q = 0.5 , en ; q = 0.3 ]
Accept - Encoding [ gzip , deflate ]
Accept - Charset [ ISO - 8859 - 1 , utf - 8 ; q = 0.7 , * ; q = 0.7 ]
Keep - Alive [ 300 ]
Connection [ keep - alive ]
Referer [ http :// www . aef . cci . fr / accueil / listeEntreprises ]
Cookie [ _ZopeId = " 37588307A4Cwlzf3Lu4 " ; SERVERID = zope5 ; __utma = 67492737.1045109818 . 1252916446.1252916446 . 1252916446.1 ; __utmb = 67492737.50 . 10.1252916446 ; __utmc = 67492737 ; __utmz = 67492737.1252916446 . 1.1 . utmgclid = CKnsnYPW8JwCFZkA4wodMjdujQ | utmccn = ( not % 20 set ) | utmcmd = ( not % 20 set ) | utmctr = registre % 20 des % 20 metiers ]
2013-06-19 08:24:49 +00:00
En - t<EFBFBD> tes r<EFBFBD> ponse :
2012-10-16 07:44:31 +00:00
Date [ Mon , 14 Sep 2009 09 : 28 : 45 GMT ]
Server [ Zope / ( Zope 2.9 . 0 - , python 2.4 . 3 , linux2 ) ZServer / 1.1 ]
Content - Length [ 14043 ]
Content - Type [ text / html ; charset = utf - 8 ]
Cache - Control [ no - cache ]
Connection [ close ]
**/
/**
 * Fetches the AFNIC whois page of a domain and echoes the text of the "identified through
 * its SIREN number" list item when the page contains one.
 *
 * Nothing is returned. When the HTTP code is 200 the script is terminated with die() whether
 * or not the pattern matched (the if without braces only guards the echo, leaving aside the
 * interleaved timestamp lines); for any other code the function does nothing.
 * getUrl() is defined outside this file.
 *
 * @param string $siteWeb Domain name appended to the whois URL (e.g. 'dipinfo.fr').
 */
function chechSirenSiteAfnic ( $siteWeb ) {
$referer = 'http://www.afnic.fr/' ;
$url = 'http://www.afnic.fr/outils/whois/' . $siteWeb ;
//die(print_r(parse_url('http://www.'.$siteWeb.'/')));
/* print_r ( pathinfo ( 'http://www.' . $siteWeb . '/index.html' ));
print_r ( pathinfo ( $siteWeb ));
die (); */
$page = getUrl ( $url , '' , '' , $referer );
if ( $page [ 'code' ] == 200 ) {
2013-06-19 08:24:49 +00:00
// Group 1 captures the text of the link that follows the SIREN label.
if ( preg_match ( '/<li>Identifi<66> gr<67> ce au num<75> ro de SIREN <a href="(?:.*) target="_blank" class="ext">(.*)<\/a><\/li>/Uis' , $page [ 'body' ], $matches ))
2012-10-16 07:44:31 +00:00
echo html_entity_decode ( $matches [ 1 ]);
die ();
}
}
// Sample call; not reached while the earlier top-level die() calls remain in place.
chechSirenSiteAfnic ( 'dipinfo.fr' );
/**
 * Searches Google for pages mentioning a SIREN and prints the candidate external links.
 *
 * The query matches either the compact SIREN or its form grouped by three digits
 * (e.g. 448364232 OR "448 364 232") and excludes a fixed list of directory sites with
 * "-site:" operators. Links are taken from every <a href="..."> of the result page; only
 * those at index 10 or above that start with "http://" and do not contain "google" are
 * echoed. The similarity scoring that used to pick a single URL was commented out in the
 * original, so nothing is selected.
 *
 * getUrl() and the EOL constant are defined outside this file.
 *
 * @param string $siren SIREN number to search for.
 * @return bool Always false.
 */
function findSiteWeb($siren)
{
    $referer = 'http://www.google.fr/';
    $siren2 = implode(' ', str_split($siren, 3));
    // The operators must be written "-site:domain" without inner spaces to be honoured.
    $rs = "$siren OR \"$siren2\" -site:gouv.fr -site:info-financiere.fr -site:bodacc.fr"
        . " -site:manageo.fr -site:bilansgratuits.fr -site:lesechos.fr -site:google.fr";
    $url = 'http://www.google.fr/search?hl=fr&as_qdr=all&q=' . urlencode($rs) . '&btnG=Rechercher'; //&meta=cr%3DcountryFR';
    //http://www.google.fr/search?hl=fr&q=448364232+OR+%22448+364+232%22+-site%3Agouv.fr++-site%3Ainfo-financiere.fr+-site%3Abodacc.fr&btnG=Rechercher&meta=
    $page = getUrl($url, '', '', $referer, false, 'www.google.fr', '', 5);
    if ($page['code'] == 200) {
        preg_match_all('/<a href="(.*)"/iU', $page['body'], $matches);
        // The loop variable reuses the name $url because the echoed literal interpolates it.
        foreach ($matches[1] as $i => $url) {
            // Indexes below 10 are skipped, as in the original condition.
            if ($i >= 10 && preg_match('/^http\:\/\//is', $url) && !preg_match('/google/is', $url)) {
                echo " $i : $siren = $url " . EOL;
            }
        }
    }
    return false;
}
//die();
/*
print_r ( $iGreffe -> getListeEtab ( 552144503 ));
//print_r($iGreffe->getIdentite(552144503));
die ();
*/
//print_r($iGreffe->getIdentite('323972596'));
//print_r($iGreffe->getIdentite('552144503'));
?>