<?php
// VCS revision timestamp: 2012-10-16 10:08:03 +02:00
//include_once('/var/www/default/_includes/timer.php');
/**
 * Parses a raw HTTP response (status line + headers + body) into an array.
 *
 * Returned keys:
 *   'code'   => three-digit status code taken from the status line (string),
 *               or null when the first line is not an HTTP status line
 *   'header' => array of header name => value(s); every value is followed by
 *               "\n" and repeated headers are concatenated into one entry
 *   'body'   => everything after the first empty line, or null when the
 *               response contains no empty line
 *
 * The first five "Set-cookie: RMID=..." headers are deliberately left out of
 * 'header'.
 * NOTE(review): the header name is compared case-sensitively ('Set-cookie');
 * confirm this matches the casing sent by the servers involved.
 *
 * @param string $this_response Raw response text, headers included
 * @return array
 */
function parse_response($this_response) {
    // Split response into header and body sections (headers stop at the first empty line).
    $sections = explode("\r\n\r\n", $this_response, 2);
    $response_headers = $sections[0];
    $response_body = isset($sections[1]) ? $sections[1] : null;
    $response_header_lines = explode("\r\n", $response_headers);

    // First line of headers is the HTTP status line ("HTTP/1.1 200 OK", "HTTP/2 200").
    $response_code = null;
    $http_response_line = array_shift($response_header_lines);
    if (preg_match('@^HTTP/[0-9](?:\.[0-9])? ([0-9]{3})@', $http_response_line, $matches)) {
        $response_code = $matches[1];
    }

    // Put the rest of the headers in an array.
    $response_header_array = array();
    $nbRMID = 0;
    foreach ($response_header_lines as $header_line) {
        $parts = explode(': ', $header_line, 2);
        $header = $parts[0];
        $value = isset($parts[1]) ? $parts[1] : '';

        // Skip (and count) the first five RMID cookies.
        if ($header == 'Set-cookie' && substr($value, 0, 5) == 'RMID=' && $nbRMID < 5) {
            $nbRMID++;
            continue;
        }

        if (!isset($response_header_array[$header])) {
            $response_header_array[$header] = '';
        }
        $response_header_array[$header] .= $value . "\n";
    }

    return array('code' => $response_code, 'header' => $response_header_array, 'body' => $response_body);
}
/**
 * Fetches a URL with cURL and returns the parsed response.
 *
 * @param string       $url        Remote URL of the page to fetch
 * @param string       $strCookies Cookies to send, separated by "\n"; every
 *                                 non-blank line becomes its own "Cookie:" header
 * @param array|string $postData   Data to POST. An array is serialised as
 *                                 key=value pairs joined with "&" (values go
 *                                 through utf8_encode(); nothing is URL-encoded,
 *                                 so callers must pass already-encoded values).
 *                                 A non-empty string is sent as-is. '' = no POST.
 * @param string       $referer    Referer to send with the request
 * @param bool         $debug      When true, the URL and the raw response are
 *                                 appended to a file under /tmp
 * @param string       $host       Value of the Host header; defaults to the
 *                                 host part of $url
 * @param string       $proxy      Proxy handed to CURLOPT_PROXY ('' = none)
 * @param int          $timeout    Seconds, used for both the connect and the
 *                                 total timeout; 0 keeps the cURL defaults
 * @param int          $nbRetry    Number of extra attempts after a failed transfer
 *
 * @return array 'code', 'header' and 'body' as built by parse_response(). When
 *               cURL fails, 'code' is 408 for a timeout (errno 28) or 400
 *               otherwise, 'header' holds 'curl_errno'/'curl_error' and 'body'
 *               an error message.
 */
function getUrl($url, $strCookies = '', $postData = '', $referer = '', $debug = false, $host = '', $proxy = '', $timeout = 0, $nbRetry = 0) {
    $ch = curl_init();

    $this_header = array('Host: ' . ($host == '' ? parse_url($url, PHP_URL_HOST) : $host));

    curl_setopt($ch, CURLOPT_URL, $url);
    // Keep the response headers in the returned text; parse_response() splits them off.
    curl_setopt($ch, CURLOPT_HEADER, 1);
    if ($proxy <> '') {
        curl_setopt($ch, CURLOPT_PROXY, $proxy);
    }
    $timeout = (int) $timeout;
    if ($timeout <> 0) {
        curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    }
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

    if (preg_match('/^https/i', $url)) {
        // SECURITY NOTE(review): peer certificate verification is switched off for
        // every HTTPS request, which allows man-in-the-middle attacks. Left as-is
        // because callers may depend on it; confirm whether it can be enabled.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    }

    $user_agent = 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)';
    curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
    curl_setopt($ch, CURLOPT_REFERER, $referer);

    // One "Cookie:" header per non-blank line of $strCookies.
    if ($strCookies != '') {
        foreach (explode("\n", $strCookies) as $this_cookie) {
            if (trim($this_cookie) <> '') {
                $this_header[] = 'Cookie: ' . $this_cookie;
            }
        }
    }

    if ($postData != '') {
        if (is_array($postData)) {
            $pairs = array();
            foreach ($postData as $k => $v) {
                // NOTE: utf8_encode() is deprecated as of PHP 8.2; kept to preserve the bytes sent.
                $pairs[] = $k . '=' . utf8_encode($v);
            }
            $post_data = implode('&', $pairs);
        } else {
            // A ready-made body used to be dropped (an empty POST was sent); send it unchanged.
            $post_data = (string) $postData;
        }
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $post_data);
    }

    curl_setopt($ch, CURLOPT_HTTPHEADER, $this_header);

    // One initial attempt plus $nbRetry retries; stop at the first successful transfer.
    $nbRetry = max(0, (int) $nbRetry);
    $timeoutBody = ($nbRetry === 0)
        ? 'Connexion impossible au site du partenaire/Timeout'
        : 'Connexion impossible au site du partenaire';
    $page = false;
    $response = null;
    for ($numTry = 0; $numTry <= $nbRetry; $numTry++) {
        $page = curl_exec($ch);
        if ($page !== false) {
            $response = parse_response($page);
            break;
        }
        $errno = curl_errno($ch);
        $error = curl_error($ch);
        $response = array(
            'code'   => ($errno == 28) ? 408 : 400, // 28 = CURLE_OPERATION_TIMEDOUT
            'header' => array('curl_errno' => $errno, 'curl_error' => $error),
            'body'   => ($errno == 28) ? $timeoutBody : 'Erreur Curl : ' . $error,
        );
    }

    if ($debug) {
        // Turn the URL into something usable as a file name.
        $url2 = str_replace(array('http://', '/', '?', '&'), array('', '_', '(param)', '(et)'), $url);
        // microtime_float() is not defined in this file; use it when the project
        // provides it, otherwise fall back to the built-in equivalent.
        $stamp = function_exists('microtime_float') ? microtime_float(true) : microtime(true);
        $fp = fopen('/tmp/curl-' . date('Ymd-His') . '-' . $stamp . '-' . $url2 . '.html', 'a');
        if ($fp) {
            fwrite($fp, $url . "\r\n");
            if ($page !== false) {
                fwrite($fp, $page);
            }
            fclose($fp);
        }
    }

    curl_close($ch);
    return $response;
}
/**
 * Extracts text from an HTML page.
 *
 * Starting at offset $fin, the function looks for $strToFind, then for the
 * next $strDeb after it, then for the next $strEnd after that $strDeb, and
 * returns what lies between the two delimiters.
 *
 * @param string $pageHtml        Page to search
 * @param string $strToFind       Marker that announces each occurrence
 * @param string $strDeb          Start delimiter, searched after the marker
 * @param string $strEnd          End delimiter, searched after the start delimiter
 * @param bool   $include_strDeb  Keep $strDeb at the beginning of the result
 * @param bool   $include_strEnd  Keep $strEnd at the end of the result
 * @param bool   $ltrim           Left-trim the result
 * @param bool   $rtrim           Right-trim the result
 * @param int    $fin             In: offset where the search starts (negative
 *                                values are treated as 0). Out: offset of the
 *                                last end delimiter found, so that a following
 *                                call continues after it. Left untouched when
 *                                nothing is found.
 * @param int    $nbOcc           1 = return the first occurrence as a string;
 *                                any other value N = return an array with up to
 *                                N occurrences (a value that is never reached,
 *                                such as 0, collects all of them)
 *
 * @return string|array|null String when $nbOcc == 1 (null if nothing was found),
 *                           otherwise an array, possibly empty
 */
function getTextInHtml($pageHtml, $strToFind, $strDeb, $strEnd, $include_strDeb = false, $include_strEnd = false, $ltrim = true, $rtrim = true, &$fin = 0, $nbOcc = 1) {
    $tabUrl = array();
    $nbOccTrouve = 0;
    $pageLen = strlen($pageHtml);
    $offset = max(0, (int) $fin);

    while ($offset <= $pageLen && ($deb = strpos($pageHtml, $strToFind, $offset)) !== false) {
        if ($deb + 1 > $pageLen) {
            break;
        }
        // The start delimiter is searched from one character after the marker position.
        $deb2 = strpos($pageHtml, $strDeb, $deb + 1);
        if ($deb2 === false) {
            break;
        }
        // Search the end delimiter after the start delimiter, so that a start
        // delimiter such as 'href="' cannot be mistaken for the end delimiter '"'.
        $contentStart = $deb2 + strlen($strDeb);
        $end = strpos($pageHtml, $strEnd, $contentStart);
        if ($end === false) {
            // Stop here: carrying on with a false offset would restart the scan at 0.
            break;
        }
        $fin = $end;

        $from = $include_strDeb ? $deb2 : $contentStart;
        $to = $include_strEnd ? $end + strlen($strEnd) : $end;
        $s_temp = substr($pageHtml, $from, $to - $from);
        if ($ltrim) {
            $s_temp = ltrim($s_temp);
        }
        if ($rtrim) {
            $s_temp = rtrim($s_temp);
        }

        if ($nbOcc == 1) {
            return $s_temp;
        }
        $tabUrl[$nbOccTrouve] = $s_temp;
        $nbOccTrouve++;
        if ($nbOcc == $nbOccTrouve) {
            break;
        }
        $offset = $fin;
    }

    return ($nbOcc == 1) ? null : $tabUrl;
}
/**
 * Queries the whois server of a TLD and reports the state of a domain name.
 *
 * Return codes:
 *   0 => info, the name is taken
 *   1 => info, the name is free
 *   2 => info, the name is pending
 *   3 => warning, too many requests
 *   4 => error, no search pattern (regexp) is configured for the TLD
 *   5 => error, the whois answer was empty (also returned when no whois host
 *        is configured for the TLD or the connection failed)
 *
 * @param string $domain Domain name without its extension
 * @param string $tdl    Extension without the dot ('fr', 'com', 'net')
 * @param bool   $debug  When true, return the raw whois answer instead of a code
 * @return int|string
 */
function simpleWhois($domain, $tdl, $debug = false) {
    // Extension-specific settings: whois host, "not registered" pattern and
    // optional "pending" pattern (PCRE bodies, matched case-insensitively).
    $info = array(
        'fr' => array(
            'host'    => 'whois.nic.fr',
            'regexp'  => 'No\s*entries\s*found',
            'pending' => 'status\s*:\s*REDEMPTION',
        ),
        'com' => array(
            'host'   => 'whois.crsnic.net',
            'regexp' => 'No\s*match\s*for',
        ),
        'net' => array(
            'host'   => 'whois.crsnic.net',
            'regexp' => 'No\s*match\s*for',
        ),
    );

    $output = '';

    // Send the query; the 5 aborts the connection attempt after five seconds.
    if (isset($info[$tdl]['host'])) {
        $req = fsockopen($info[$tdl]['host'], 43, $errno, $errstr, 5);
        if ($req) {
            // The whois protocol expects the query terminated by CRLF.
            fputs($req, $domain . '.' . $tdl . "\r\n");
            while (!feof($req)) {
                $line = fgets($req, 4096);
                if ($line === false) {
                    // Read error or timeout: stop instead of spinning on a stream that never reaches EOF.
                    break;
                }
                $output .= $line;
            }
            fclose($req);
        }
    }

    // Debug mode
    if ($debug) return $output;

    // Empty answer
    if (empty($output)) return 5;

    // Too many requests
    if (preg_match('/(Too\s+many\s+requests|Your\s+connection\s+limit\s+exceeded|daily\s+whois\s+limit\s+exceeded|Maximum\s+queries|WHOIS\s+LIMIT\s+EXCEEDED|referral\s+host\s+not\s+responding|Excessive\s+querying)/i', $output)) return 3;

    // Pending
    if (!empty($info[$tdl]['pending']) && preg_match('/' . $info[$tdl]['pending'] . '/i', $output)) return 2;

    // Missing search pattern
    if (empty($info[$tdl]['regexp'])) return 4;

    // Free
    if (preg_match('/' . $info[$tdl]['regexp'] . '/i', $output)) return 1;

    // Taken
    return 0;
}
// VCS revision timestamp: 2015-07-20 10:17:28 +02:00
/*** Fonction PDF ***/
/**
 * Collects basic information about a PDF file.
 *
 * The file is read line by line (4096-byte chunks at most): the first line
 * gives the version from its "%PDF-x.y" header, and every line matching
 * "/Type /Page" (but not "/Pages") counts as one page.
 *
 * @param string $f Path of the PDF file
 * @return array|false false when the file cannot be opened or when no "%%EOF"
 *                     marker is found in the last two chunks read; otherwise an
 *                     array with the keys 'file', 'fileName', 'version' (only
 *                     when the header was recognised), 'pdfEOF', 'pdfSize',
 *                     'nbPages' and 'nbCar' (length of the text returned by
 *                     pdf2text())
 */
function getPdfInfo($f) {
    $tabInfo = array(
        'file'     => $f,
        'fileName' => basename($f),
    );

    $handle = (is_file($f) && is_readable($f)) ? fopen($f, 'r') : false;
    if (!$handle) {
        return false;
    }

    $nbPages = 0;
    $buffer = '';
    $prev_buffer = '';
    $isFirstLine = true;
    while (!feof($handle)) {
        $prev_buffer = $buffer;
        $buffer = (string) fgets($handle, 4096); // false at end of file becomes ''
        if ($isFirstLine && preg_match('/^\%PDF\-(.*)\s/U', $buffer, $matches)) {
            $tabInfo['version'] = $matches[1];
        } elseif (preg_match('/Type\s*\/Page[^s]/', $buffer)) {
            ++$nbPages;
        }
        $isFirstLine = false;
    }
    // Close before any return so the handle is not leaked on the failure path.
    fclose($handle);

    // A complete PDF ends with %%EOF, possibly followed by a final empty read.
    if (!preg_match('/\%\%EOF/', $prev_buffer) && !preg_match('/\%\%EOF/', $buffer)) {
        return false;
    }
    $tabInfo['pdfEOF'] = true;

    $tabInfo['pdfSize'] = filesize($f);
    $tabInfo['nbPages'] = (int) $nbPages;
    $tabInfo['nbCar'] = strlen(pdf2text($f));
    return $tabInfo;
}
/** @link http://webcheatsheet.com/php/reading_clean_text_from_pdf.php
*/
/**
 * Decodes ASCIIHexDecode data (pairs of hexadecimal digits ended by '>').
 *
 * White space (NUL, tab, CR, FF, LF, space) is skipped, and so is everything
 * from '%' up to the end of the line. When the data holds an odd number of
 * digits, the last one is completed with a 0.
 *
 * @param string $input Encoded data, including the closing '>'
 * @return string Decoded bytes; '' when a character is not a hexadecimal digit
 *                or when the closing '>' is missing
 */
function decodeAsciiHex($input) {
    $output = '';
    $highNibble = null; // first digit of the byte being assembled, null between bytes
    $isComment = false;
    $length = strlen($input);

    for ($i = 0; $i < $length && $input[$i] != '>'; $i++) {
        $c = $input[$i];

        if ($isComment) {
            if ($c == "\r" || $c == "\n") {
                $isComment = false;
            }
            continue;
        }
        if ($c == '%') {
            $isComment = true;
            continue;
        }
        if (strpos("\0\t\r\f\n ", $c) !== false) {
            continue;
        }
        if (stripos('0123456789abcdef', $c) === false) {
            return '';
        }

        $code = hexdec($c);
        if ($highNibble === null) {
            $highNibble = $code;
        } else {
            $output .= chr($highNibble * 16 + $code);
            $highNibble = null;
        }
    }

    // The loop ran off the end of the input: the '>' end marker is missing.
    if ($i >= $length) {
        return '';
    }
    // Odd number of digits: behave as if a 0 followed the last one.
    if ($highNibble !== null) {
        $output .= chr($highNibble * 16);
    }
    return $output;
}
/**
 * Decodes ASCII85Decode data (groups of five characters '!'..'u' ended by '~').
 *
 * White space (NUL, tab, CR, FF, LF, space) is skipped, and so is everything
 * from '%' up to the end of the line. 'z' between groups stands for four zero
 * bytes. A final group of n characters (2 to 4) yields n - 1 bytes.
 *
 * @param string $input Encoded data; decoding stops at the first '~'
 * @return string Decoded bytes; '' when a character is outside '!'..'u' or when
 *                the final group holds a single character
 */
function decodeAscii85($input) {
    $output = '';
    $isComment = false;
    $ords = array();
    $state = 0; // number of characters collected for the current group
    $length = strlen($input);

    for ($i = 0; $i < $length && $input[$i] != '~'; $i++) {
        $c = $input[$i];

        if ($isComment) {
            if ($c == "\r" || $c == "\n") {
                $isComment = false;
            }
            continue;
        }
        if (strpos("\0\t\r\f\n ", $c) !== false) {
            continue;
        }
        if ($c == '%') {
            $isComment = true;
            continue;
        }
        if ($c == 'z' && $state === 0) {
            $output .= str_repeat(chr(0), 4);
            continue;
        }
        if ($c < '!' || $c > 'u') {
            return '';
        }

        $ords[$state++] = ord($c) - ord('!');
        if ($state == 5) {
            $state = 0;
            for ($sum = 0, $j = 0; $j < 5; $j++) {
                $sum = $sum * 85 + $ords[$j];
            }
            for ($j = 3; $j >= 0; $j--) {
                $output .= chr(($sum >> ($j * 8)) & 0xFF);
            }
        }
    }

    if ($state === 1) {
        return '';
    }
    if ($state > 1) {
        // Incomplete final group: the last digit is bumped by one (stands in for
        // the missing padding) and only the first $state - 1 bytes are kept.
        for ($i = 0, $sum = 0; $i < $state; $i++) {
            $digit = $ords[$i] + (($i == $state - 1) ? 1 : 0);
            $sum += $digit * pow(85, 4 - $i);
        }
        for ($i = 0; $i < $state - 1; $i++) {
            $output .= chr(($sum >> ((3 - $i) * 8)) & 0xFF);
        }
    }
    return $output;
}
/**
 * Inflates zlib-compressed data with gzuncompress().
 *
 * @param string $input Compressed data
 * @return string|false Inflated data, or false when gzuncompress() fails (its
 *                      warning is suppressed)
 */
function decodeFlate($input) {
    $inflated = @gzuncompress($input);

    return $inflated;
}
/**
 * Reads the first "<< ... >>" dictionary of a PDF object into an array.
 *
 * The dictionary text is split on "/". A piece of the form "Name value ..."
 * gives 'Name' => first word after the name; a piece made of a name only gives
 * 'Name' => true. Consequently "/Filter /FlateDecode" produces both
 * 'Filter' => true and 'FlateDecode' => true.
 *
 * @param string $object Text of the object
 * @return array Options found; empty when the object has no dictionary
 */
function getObjectOptions($object) {
    if (!preg_match('#<<(.*)>>#ismU', $object, $dictionary)) {
        return array();
    }

    $entries = explode('/', $dictionary[1]);
    array_shift($entries); // drop what precedes the first "/"

    $options = array();
    foreach ($entries as $entry) {
        // Collapse every run of white space into a single space.
        $entry = preg_replace('#\s+#', ' ', trim($entry));
        if (strpos($entry, ' ') !== false) {
            $parts = explode(' ', $entry);
            $options[$parts[0]] = $parts[1];
        } else {
            $options[$entry] = true;
        }
    }
    return $options;
}
/**
 * Decodes the content of a PDF stream according to the options of its object.
 *
 * Without a 'Filter' option the stream is returned unchanged. Otherwise the
 * stream is cut to 'Length' bytes (whole stream when 'Length' is missing or
 * not numeric) and passed through the decoder of every filter name present
 * among the option keys: ASCIIHexDecode, ASCII85Decode, FlateDecode.
 *
 * NOTE(review): the filters run in the order of the option keys, and a
 * 'Length' given as an indirect reference ("12 0 R") reaches this function as
 * its first number only — confirm both against real documents.
 *
 * @param string $stream  Raw stream content
 * @param array  $options Options as returned by getObjectOptions()
 * @return string|false Decoded data (whatever the last decoder returned)
 */
function getDecodedStream($stream, $options) {
    if (empty($options['Filter'])) {
        return $stream;
    }

    $length = (!empty($options['Length']) && is_numeric($options['Length']))
        ? (int) $options['Length']
        : strlen($stream);
    $_stream = substr($stream, 0, $length);

    foreach ($options as $key => $value) {
        // Strict comparison on the string form: with "==" an integer key 0
        // would match every filter name before PHP 8.
        $key = (string) $key;
        if ($key === 'ASCIIHexDecode') {
            $_stream = decodeAsciiHex($_stream);
        }
        if ($key === 'ASCII85Decode') {
            $_stream = decodeAscii85($_stream);
        }
        if ($key === 'FlateDecode') {
            $_stream = decodeFlate($_stream);
        }
    }
    return $_stream;
}
/**
 * Collects the raw ("dirty") text operands found in text containers.
 *
 * For each container, the operands of "[...] TJ" are taken when at least one
 * is present; otherwise the "(...)" operands of "Td (...) Tj" are taken.
 *
 * @param array $texts          Receives the operands found (appended)
 * @param array $textContainers Contents of the BT ... ET sections
 * @return void
 */
function getDirtyTexts(&$texts, $textContainers) {
    foreach ($textContainers as $container) {
        if (preg_match_all('#\[(.*)\]\s*TJ#ismU', $container, $parts)) {
            $texts = array_merge($texts, $parts[1]);
        } elseif (preg_match_all('#Td\s*(\(.*\))\s*Tj#ismU', $container, $parts)) {
            $texts = array_merge($texts, $parts[1]);
        }
    }
}
/**
 * Reads the character mappings of a stream ("beginbfchar" and "beginbfrange"
 * sections) into $transformations.
 *
 * Keys and values are strings of hexadecimal digits. Keys coming from bfchar
 * are padded to four digits with str_pad() (zeros are added on the right) and
 * keep the case found in the stream; keys and values coming from bfrange are
 * formatted with "%04X" (upper case).
 * NOTE(review): right-padding and the mixed letter case mirror the way
 * getTextUsingTransformations() builds its lookup keys — change both together.
 *
 * @param array  $transformations Receives the mappings, source code => target code
 * @param string $stream          Decoded stream content
 * @return void
 */
function getCharTransformations(&$transformations, $stream) {
    preg_match_all('#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU', $stream, $chars, PREG_SET_ORDER);
    preg_match_all('#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU', $stream, $ranges, PREG_SET_ORDER);

    // Single characters: "<src> <dst>", one mapping per line.
    foreach ($chars as $section) {
        $count = (int) $section[1];
        $current = explode("\n", trim($section[2]));
        for ($k = 0; $k < $count && $k < count($current); $k++) {
            if (preg_match('#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is', trim($current[$k]), $map)) {
                $transformations[str_pad($map[1], 4, '0')] = $map[2];
            }
        }
    }

    // Ranges: "<first> <last> <dst>" or "<first> <last> [<dst1> <dst2> ...]".
    foreach ($ranges as $section) {
        $count = (int) $section[1];
        $current = explode("\n", trim($section[2]));
        for ($k = 0; $k < $count && $k < count($current); $k++) {
            $line = trim($current[$k]);
            if (preg_match('#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is', $line, $map)) {
                $from = hexdec($map[1]);
                $to = hexdec($map[2]);
                $_from = hexdec($map[3]);
                for ($m = $from, $n = 0; $m <= $to; $m++, $n++) {
                    $transformations[sprintf('%04X', $m)] = sprintf('%04X', $_from + $n);
                }
            } elseif (preg_match('#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU', $line, $map)) {
                $from = hexdec($map[1]);
                $to = hexdec($map[2]);
                $parts = preg_split('#\s+#', trim($map[3]));
                for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++) {
                    // Remove the angle brackets: hexdec() must only see hexadecimal digits.
                    $transformations[sprintf('%04X', $m)] = sprintf('%04X', hexdec(trim($parts[$n], '<>')));
                }
            }
        }
    }
}
/**
 * Turns raw text operands into readable text.
 *
 * Each operand is scanned character by character:
 *   <....>  hexadecimal string: cut into groups of four digits, each group is
 *           looked up in $transformations and the result is decoded through
 *           html_entity_decode() as a numeric character reference
 *   (....)  literal string: copied as-is, with the escapes \\ \( \) \n \r \t \b
 *           \f and \ddd (octal) resolved; balanced nested parentheses and the
 *           characters < and > are kept as part of the text
 * Anything outside these two forms (spacing numbers, for instance) is ignored.
 * Every operand is followed by "\n" in the result.
 *
 * @param array $texts           Raw operands, as collected by getDirtyTexts()
 * @param array $transformations Character mappings, source code => target code
 * @return string
 */
function getTextUsingTransformations($texts, $transformations) {
    $escapes = array('n' => "\n", 'r' => "\r", 't' => "\t", 'b' => "\x08", 'f' => "\f");
    $document = '';

    foreach ($texts as $text) {
        $isHex = false;
        $isPlain = false;
        $depth = 0; // nesting level of unescaped parentheses inside a literal string
        $hex = '';
        $plain = '';
        $length = strlen($text);

        for ($j = 0; $j < $length; $j++) {
            $c = $text[$j];
            switch ($c) {
                case '<':
                    if ($isPlain) {
                        $plain .= $c;
                        break;
                    }
                    $hex = '';
                    $isHex = true;
                    break;

                case '>':
                    if ($isPlain) {
                        $plain .= $c;
                        break;
                    }
                    if ($isHex && $hex !== '') {
                        foreach (str_split($hex, 4) as $chunk) {
                            $chex = str_pad($chunk, 4, '0');
                            if (isset($transformations[$chex])) {
                                $chex = $transformations[$chex];
                            }
                            // A mapping may hold several four-digit units; decode them one by one.
                            foreach (str_split($chex, 4) as $unit) {
                                $document .= html_entity_decode('&#x' . $unit . ';');
                            }
                        }
                    }
                    $isHex = false;
                    break;

                case '(':
                    if ($isPlain) {
                        $depth++;
                        $plain .= $c;
                        break;
                    }
                    $plain = '';
                    $isPlain = true;
                    break;

                case ')':
                    if ($isPlain && $depth > 0) {
                        $depth--;
                        $plain .= $c;
                        break;
                    }
                    if ($isPlain) {
                        $document .= $plain;
                        $plain = '';
                        $isPlain = false;
                    }
                    break;

                case '\\':
                    if ($j + 1 >= $length) {
                        break; // backslash at the very end: nothing to escape
                    }
                    $c2 = $text[$j + 1];
                    if (in_array($c2, array('\\', '(', ')'), true)) {
                        $plain .= $c2;
                    } elseif (isset($escapes[$c2])) {
                        $plain .= $escapes[$c2];
                    } elseif (preg_match('#^[0-7]{1,3}#', substr($text, $j + 1, 3), $octal)) {
                        $oct = $octal[0];
                        $j += strlen($oct) - 1;
                        $plain .= html_entity_decode('&#' . octdec($oct) . ';');
                    }
                    // Any other escaped character is dropped together with the backslash.
                    $j++;
                    break;

                default:
                    if ($isHex) {
                        $hex .= $c;
                    }
                    if ($isPlain) {
                        $plain .= $c;
                    }
                    break;
            }
        }
        $document .= "\n";
    }

    return $document;
}
/**
 * Extracts the text of a PDF file.
 *
 * Every "obj ... endobj" section holding a stream is examined. Objects whose
 * dictionary has a Length1, Type or Subtype entry are skipped. The remaining
 * streams are decoded: those containing BT ... ET sections provide text
 * operands, the others are searched for character mappings. The operands are
 * finally converted with the mappings collected.
 *
 * @param string $filename Path of the PDF file
 * @return string Extracted text; '' when the file is missing, unreadable or empty
 */
function pdf2text($filename) {
    $infile = (is_file($filename) && is_readable($filename)) ? file_get_contents($filename) : false;
    if (empty($infile)) {
        return '';
    }

    $transformations = array();
    $texts = array();

    // No match (or a PCRE failure on a very large file) leaves nothing to walk through.
    $objects = preg_match_all('#obj(.*)endobj#ismU', $infile, $found) ? $found[1] : array();

    foreach ($objects as $currentObject) {
        if (!preg_match('#stream(.*)endstream#ismU', $currentObject, $stream)) {
            continue;
        }
        $stream = ltrim($stream[1]);

        $options = getObjectOptions($currentObject);
        if (!(empty($options['Length1']) && empty($options['Type']) && empty($options['Subtype']))) {
            continue;
        }

        $data = getDecodedStream($stream, $options);
        // A failed decoder returns false: only non-empty strings are usable.
        if (!is_string($data) || $data === '') {
            continue;
        }

        if (preg_match_all('#BT(.*)ET#ismU', $data, $textContainers)) {
            getDirtyTexts($texts, $textContainers[1]);
        } else {
            getCharTransformations($transformations, $data);
        }
    }

    return getTextUsingTransformations($texts, $transformations);
}
// VCS revision timestamp: 2012-10-16 10:08:03 +02:00
?>