batch/framework/common/curl.php

<?

//include_once('/var/www/default/_includes/timer.php');

/**
 * Split a raw HTTP response (status line + headers + body) into its parts.
 *
 * Interim "1xx" header blocks (e.g. "HTTP/1.1 100 Continue") that are directly
 * followed by another status line are skipped, so the final response is the
 * one reported.
 *
 * The first five header lines of the exact form "Set-cookie: RMID=..." are
 * discarded; any further ones are kept like every other header. The header
 * name comparison is case-sensitive, as it always was.
 *
 * @param string $this_response Raw response text, headers included.
 * @return array 'code'   => 3-digit status code as a string, or null when the
 *                           status line cannot be parsed
 *               'header' => array(header name => values, each followed by "\n";
 *                           repeated headers are concatenated)
 *               'body'   => everything after the first blank line, or null when
 *                           the response contains no blank line
 */
function parse_response($this_response) {
	$remaining = (string)$this_response;

	do {
		// Split response into header and body sections
		$sections = explode("\r\n\r\n", $remaining, 2);
		$response_header_lines = explode("\r\n", $sections[0]);
		$response_body = isset($sections[1]) ? $sections[1] : null;

		// First line of headers is the HTTP status line. The minor version is
		// optional so that "HTTP/2 200" is recognised as well as "HTTP/1.1 200 OK".
		$http_response_line = array_shift($response_header_lines);
		$response_code = null;
		if (preg_match('@^HTTP/[0-9]+(?:\.[0-9]+)? ([0-9]{3})@', $http_response_line, $matches)) {
			$response_code = $matches[1];
		}

		// An interim response is followed by the real one: parse that instead.
		$is_interim = $response_code !== null
			&& $response_code[0] === '1'
			&& $response_body !== null
			&& strncmp($response_body, 'HTTP/', 5) === 0;
		$remaining = $response_body;
	} while ($is_interim);

	// Put the rest of the headers in an array
	$response_header_array = array();
	$nbRMID = 0;
	foreach ($response_header_lines as $header_line) {
		$parts = explode(': ', $header_line, 2);
		$header = $parts[0];
		// A line without ": " is kept under its full text with an empty value.
		$value = isset($parts[1]) ? $parts[1] : '';

		if ($header === 'Set-cookie' && substr($value, 0, 5) === 'RMID=' && $nbRMID < 5) {
			$nbRMID++;
			continue;
		}

		if (!isset($response_header_array[$header])) {
			$response_header_array[$header] = '';
		}
		$response_header_array[$header] .= $value."\n";
	}

	return array('code' => $response_code, 'header' => $response_header_array, 'body' => $response_body);
}

/**
 * Fetch a remote page with cURL and return it as a parsed response.
 *
 * Parameters:
 *   $url        Remote URL of the page to fetch.
 *   $strCookies Cookies to send, separated by "\n"; every non-blank line is
 *               sent as its own "Cookie:" request header.
 *   $postData   POST payload. An array is serialised as "key=value&key=value"
 *               (values go through utf8_encode() and are NOT url-encoded, as
 *               before); a non-empty string is sent unchanged. When empty, no
 *               POST option is set on the handle.
 *   $referer    Referer to announce.
 *   $debug      When true, the URL and raw response are appended to a file
 *               named /tmp/curl-<date>-<microtime>-<url>.html.
 *   $host       Value for the "Host:" header; defaults to the host of $url.
 *   $proxy      Proxy passed to CURLOPT_PROXY when non-empty.
 *   $timeout    Seconds, applied to both CURLOPT_TIMEOUT and
 *               CURLOPT_CONNECTTIMEOUT when non-zero.
 *   $nbRetry    Number of additional attempts made after a failed transfer.
 *
 * Returns an array:
 *   "code"   => server response code (408 on cURL timeout, 400 on any other
 *               cURL error)
 *   "header" => server headers (or 'curl_errno'/'curl_error' on cURL failure)
 *   "body"   => page content (or an error message on cURL failure)
 */
function getUrl($url, $strCookies='', $postData='', $referer='', $debug=false, $host='', $proxy='', $timeout=0, $nbRetry=0) {

	$ch = curl_init();

	// Host header: explicit override, otherwise the host part of the URL.
	$this_header = array('Host: '. ($host=='' ? parse_url($url, PHP_URL_HOST) : $host));

	curl_setopt($ch, CURLOPT_URL, $url);
	curl_setopt($ch, CURLOPT_HEADER, 1);	// headers are needed by parse_response()
	if ($proxy<>'')	curl_setopt($ch, CURLOPT_PROXY, $proxy);

	if (((int)$timeout)<>0) {
		curl_setopt($ch, CURLOPT_TIMEOUT, (int)$timeout);
		curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, (int)$timeout);
	}

	curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

	// NOTE(review): peer certificate verification is disabled for every https
	// URL. Kept as-is because callers may rely on it, but it allows
	// man-in-the-middle interception and should be revisited.
	if (preg_match('/^https/i',$url))
		curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);

	curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)');
	curl_setopt($ch, CURLOPT_REFERER, $referer);

	// One "Cookie:" header per non-blank line of $strCookies.
	if ($strCookies!='') {
		foreach (explode("\n", $strCookies) as $this_cookie) {
			if (trim($this_cookie)<>'')
				$this_header[] = 'Cookie: '.$this_cookie;
		}
	}

	if ($postData!='') {
		if (is_array($postData)) {
			// NOTE(review): values are not url-encoded; a value containing
			// '&' or '=' will corrupt the payload. Left unchanged because
			// callers may already pass pre-encoded values.
			$pairs = array();
			foreach ($postData as $k=>$v)
				$pairs[] = "$k=".utf8_encode($v);
			$post_data = implode('&', $pairs);
		} else {
			// A raw string used to be silently dropped; send it as given.
			$post_data = (string)$postData;
		}

		curl_setopt($ch, CURLOPT_POST, 1);
		curl_setopt($ch, CURLOPT_POSTFIELDS, $post_data);
	}

	curl_setopt($ch, CURLOPT_HTTPHEADER, $this_header);

	// One initial attempt plus $nbRetry retries. The counter is advanced on
	// every pass, so a persistent failure can no longer loop forever.
	$maxAttempts = max(0, (int)$nbRetry) + 1;
	// The timeout message historically differs between the two modes.
	$timeoutBody = ($nbRetry==0)
		? 'Connexion impossible au site du partenaire/Timeout'
		: 'Connexion impossible au site du partenaire';

	$page = false;
	$response = null;
	for ($numTry = 0; $numTry < $maxAttempts; $numTry++) {
		$page = curl_exec($ch);
		if ($page !== false) {
			$response = parse_response($page);
			break;
		}

		$errno = curl_errno($ch);
		$error = curl_error($ch);
		$errorHeader = array('curl_errno'=>$errno, 'curl_error'=>$error);
		if ($errno == 28) // cURL "operation timed out"
			$response = array('code' =>408, 'header' =>$errorHeader, 'body' =>$timeoutBody);
		else
			$response = array('code' =>400, 'header' =>$errorHeader, 'body' =>'Erreur Curl : ' . $error);
	}

	if ($debug) {
		// Turn the URL into something usable as a file name.
		$url2 = str_replace(array('http://', '/', '?', '&'), array('', '_', '(param)', '(et)'), $url);

		// microtime_float() comes from an include that is commented out at the
		// top of this file; fall back to the built-in when it is not loaded.
		$stamp = function_exists('microtime_float') ? microtime_float(true) : microtime(true);

		$fp = fopen('/tmp/curl-'. date('Ymd-His') .'-'. $stamp .'-'. $url2 . '.html', 'a');
		if ($fp) {
			fwrite($fp, $url."\r\n");
			fwrite($fp, $page === false ? '' : $page);
			fclose($fp);
		}
	}

	curl_close($ch);
	return $response;
}

/**
 * Extract the text that follows a marker in an HTML page.
 *
 * Starting at offset $fin, the function looks for $strToFind, then for the
 * next $strDeb after it, then for the next $strEnd after that $strDeb. The
 * text between the two delimiters is one occurrence.
 *
 * @param string $pageHtml       Page to search.
 * @param string $strToFind      Marker locating the area of interest.
 * @param string $strDeb         Start delimiter, searched after the marker.
 * @param string $strEnd         End delimiter, searched after $strDeb.
 * @param bool   $include_strDeb Keep $strDeb at the start of the result.
 * @param bool   $include_strEnd Keep $strEnd at the end of the result.
 * @param bool   $ltrim          Strip leading whitespace from the result.
 * @param bool   $rtrim          Strip trailing whitespace from the result.
 * @param int    $fin            In: offset where the search starts. Out: offset
 *                               of the last end delimiter found (unchanged when
 *                               nothing is found), so the call can be repeated
 *                               to walk through the page.
 * @param int    $nbOcc          1 returns the first occurrence as a string; any
 *                               other value returns an array of at most $nbOcc
 *                               occurrences (a value never reached, such as 0,
 *                               collects every occurrence).
 * @return string|array|null     String (or null when not found) for $nbOcc == 1,
 *                               otherwise an array, empty when nothing is found.
 */
function getTextInHtml($pageHtml, $strToFind, $strDeb, $strEnd, $include_strDeb=false, $include_strEnd=false, $ltrim=true, $rtrim=true, &$fin=0, $nbOcc=1) {
	$tabRet = array();
	$nbOccTrouve = 0;
	$len = strlen($pageHtml);
	$fin = (int)$fin;

	// An empty marker cannot be searched for (and would never advance).
	if ((string)$strToFind === '') {
		return ($nbOcc==1) ? null : $tabRet;
	}

	while ($fin <= $len && is_int($deb = strpos($pageHtml, $strToFind, $fin))) {
		$deb2 = strpos($pageHtml, $strDeb, min($deb + 1, $len));
		if ($deb2 === false) break;

		// Look for the end delimiter AFTER the start delimiter, otherwise an
		// end delimiter contained in the start delimiter matches immediately.
		$end = strpos($pageHtml, $strEnd, $deb2 + strlen($strDeb));
		// Stop rather than store false in $fin, which restarted the search at 0.
		if ($end === false) break;
		$fin = $end;

		$start = $include_strDeb ? $deb2 : $deb2 + strlen($strDeb);
		$stop = $include_strEnd ? $end + strlen($strEnd) : $end;
		$s_temp = (string)substr($pageHtml, $start, $stop - $start);

		if ($ltrim)	$s_temp = ltrim($s_temp);
		if ($rtrim)	$s_temp = rtrim($s_temp);

		if ($nbOcc==1) return $s_temp;

		$tabRet[$nbOccTrouve] = $s_temp;
		$nbOccTrouve++;

		if ($nbOcc==$nbOccTrouve) break;
	}

	return ($nbOcc==1) ? null : $tabRet;
}

/**
 * Query the WHOIS server of a TLD and classify the answer.
 *
 * Return codes:
 *   0 => info, the name is taken
 *   1 => info, the name is free
 *   2 => info, the name is pending (redemption)
 *   3 => warning, too many requests
 *   4 => error, no search pattern (regexp) is configured for the TLD
 *   5 => error, the server returned nothing (also returned for a TLD that
 *        has no configured host, or when the connection fails)
 *
 * @param string $domain Domain name without its extension.
 * @param string $tdl    Extension, without the dot ('fr', 'com', 'net').
 * @param bool   $debug  When true, return the raw server answer instead of a code.
 * @return int|string    One of the codes above, or the raw answer in debug mode.
 */
function simpleWhois($domain, $tdl, $debug=false) {
	// Per-extension settings. The patterns use \s (whitespace); the previous
	// "[s]" only matched the letter "s" and therefore never matched an answer.
	$info = array(
		'fr' 	=> array(
			'host' 		=> 'whois.nic.fr',
			'regexp'	=> 'No\s*entries\s*found',
			'pending'	=> 'status\s*:\s*REDEMPTION'
		),
		'com' 	=> array(
			'host' 		=> 'whois.crsnic.net',
			'regexp'	=> 'No\s*match\s*for'
		),
		'net' 	=> array(
			'host' 		=> 'whois.crsnic.net',
			'regexp'	=> 'No\s*match\s*for'
		),
	);
	$settings = isset($info[$tdl]) ? $info[$tdl] : array();

	$output = '';

	// Only connect when a host is configured; 5 is the connection timeout in seconds.
	$req = false;
	if (!empty($settings['host'])) {
		$req = fsockopen($settings['host'], 43, $errno, $errstr, 5);
	}

	if ($req) {
		// A WHOIS query is a single line terminated by CRLF.
		fwrite($req, $domain.'.'.$tdl."\r\n");
		while (!feof($req)) {
			$line = fgets($req, 4096);
			// fgets() fails without feof() turning true on a read error.
			if ($line === false) break;
			$output .= $line;
		}
		fclose($req);
	}

	// Debug mode
	if ($debug) return $output;
	// Empty answer
	if (empty($output)) return 5;
	// Too many requests
	if (preg_match("/(Too\s+many\s+requests|Your\s+connection\s+limit\s+exceeded|daily\s+whois\s+limit\s+exceeded|Maximum\s+queries|WHOIS\s+LIMIT\s+EXCEEDED|referral\s+host\s+not\s+responding|Excessive\s+querying)/i", $output)) return 3;
	// Pending
	if (!empty($settings['pending']) && preg_match("/".$settings['pending']."/i", $output)) return 2;
	// Missing search pattern
	if (empty($settings['regexp'])) return 4;
	// Free
	if (preg_match("/".$settings['regexp']."/i", $output)) return 1;
	// Taken
	return 0;
}


/*** Fonction PDF ***/

	/**
	 * Collect basic information about a PDF file.
	 *
	 * The file is read line by line (at most 4095 bytes per read) to pick up
	 * the version from the first line, count "/Type /Page" markers and check
	 * that one of the last two chunks read contains "%%EOF".
	 *
	 * @param string $f Path of the PDF file.
	 * @return array|false false when the file cannot be opened or has no
	 *                     "%%EOF" marker at its end; otherwise an array with
	 *                     'file', 'fileName', 'version' (only when the first
	 *                     line is a "%PDF-x.y" header), 'pdfEOF', 'pdfSize',
	 *                     'nbPages' and 'nbCar' (length of pdf2text() output).
	 */
	function getPdfInfo($f){
		$handle = @fopen($f, 'r');
		if (!$handle) {
			return false;
		}

		$tabInfo = array(	'file'=>$f,
						'fileName'=>basename($f));

		$i = $nbPages = 0;
		$buffer = $prev_buffer = '';
		while (!feof($handle)) {
			$line = fgets($handle, 4096);
			// Nothing more to read (or read error): keep the last real chunks.
			if ($line === false) break;
			$prev_buffer = $buffer;
			$buffer = $line;

			if ($i==0 && preg_match("/^\%PDF\-(.*)\s/U", $buffer, $matches))
				$tabInfo['version'] = $matches[1];
			elseif (preg_match("/Type\s*\/Page[^s]/", $buffer))
				++$nbPages;
			$i++;
		}
		// Close before any return so the handle is never leaked.
		fclose($handle);

		if (strpos($prev_buffer, '%%EOF') === false && strpos($buffer, '%%EOF') === false) {
			return false;
		}
		$tabInfo['pdfEOF'] = true;

		$tabInfo['pdfSize'] = filesize($f);
		$tabInfo['nbPages'] = $nbPages;
		$tabInfo['nbCar'] = strlen(pdf2text($f));
		return $tabInfo;
	}

/** @link : http://webcheatsheet.com/php/reading_clean_text_from_pdf.php
 */

	/**
	 * Decode the data of a PDF ASCIIHexDecode stream.
	 *
	 * Pairs of hexadecimal digits are turned into bytes up to the '>'
	 * end-of-data marker. Whitespace (NUL, tab, CR, FF, LF, space) is skipped
	 * and, as before, text from '%' to the end of the line is ignored. When
	 * the number of digits is odd, the last digit is completed with a 0.
	 *
	 * @param string $input Encoded data, including the '>' marker.
	 * @return string Decoded bytes, or '' when the input contains an invalid
	 *                character or has no '>' marker.
	 */
	function decodeAsciiHex($input) {
		$output = '';
		$length = strlen($input);

		$highNibble = -1;	// first digit of the current pair, -1 when none is pending
		$isComment = false;

		for ($i = 0; $i < $length && $input[$i] !== '>'; $i++) {
			$c = $input[$i];

			if ($isComment) {
				// Double quotes are required: '\n' would be backslash + n.
				if ($c === "\r" || $c === "\n")
					$isComment = false;
				continue;
			}

			if (strpos("\0\t\r\f\n ", $c) !== false)
				continue;

			if ($c === '%') {
				$isComment = true;
				continue;
			}

			if (!ctype_xdigit($c))
				return '';

			$code = hexdec($c);
			if ($highNibble < 0) {
				$highNibble = $code;
			} else {
				$output .= chr($highNibble * 16 + $code);
				$highNibble = -1;
			}
		}

		// The loop ran off the end: the '>' marker is missing.
		if ($i >= $length)	return '';

		// Odd number of digits: the missing low digit counts as 0.
		if ($highNibble >= 0)	$output .= chr($highNibble * 16);

		return $output;
	}

	/**
	 * Decode the data of a PDF ASCII85Decode stream.
	 *
	 * Every group of five characters in the range '!'..'u' encodes four bytes;
	 * 'z' between groups stands for four zero bytes; a final group of n
	 * characters (2 <= n <= 4) encodes n - 1 bytes. Decoding stops at the
	 * first '~'. Whitespace (NUL, tab, CR, FF, LF, space) is skipped.
	 *
	 * '%' is decoded as the base-85 digit it is: treating it as the start of
	 * a comment discarded valid data.
	 *
	 * @param string $input Encoded data.
	 * @return string Decoded bytes, or '' when the input contains a character
	 *                outside the alphabet or ends with a single-character group.
	 */
	function decodeAscii85($input) {
		$output = '';
		$length = strlen($input);

		$ords = array();	// digits of the group being read
		$state = 0;			// number of digits read in the current group

		for ($i = 0; $i < $length && $input[$i] !== '~'; $i++) {
			$c = $input[$i];

			// Double quotes are required: '\n' would be backslash + n.
			if (strpos("\0\t\r\f\n ", $c) !== false)
				continue;

			if ($c === 'z' && $state === 0) {
				$output .= str_repeat(chr(0), 4);
				continue;
			}

			$code = ord($c);
			if ($code < ord('!') || $code > ord('u'))
				return '';

			$ords[$state++] = $code - ord('!');

			if ($state == 5) {
				$state = 0;
				$sum = 0;
				for ($j = 0; $j < 5; $j++)
					$sum = $sum * 85 + $ords[$j];
				for ($j = 3; $j >= 0; $j--)
					$output .= chr(($sum >> ($j * 8)) & 0xFF);
			}
		}

		if ($state === 1)
			return '';

		if ($state > 1) {
			// Partial final group: the last digit is incremented so that the
			// truncated value rounds to the right bytes.
			$sum = 0;
			for ($i = 0; $i < $state; $i++) {
				$digit = $ords[$i] + ($i === $state - 1 ? 1 : 0);
				$sum += $digit * pow(85, 4 - $i);
			}
			for ($i = 0; $i < $state - 1; $i++)
				$output .= chr(($sum >> ((3 - $i) * 8)) & 0xFF);
		}

		return $output;
	}

	/**
	 * Inflate zlib-compressed data (PDF FlateDecode filter).
	 *
	 * @param string $input Compressed data.
	 * @return string|false Uncompressed data, or false when gzuncompress()
	 *                      fails; its warning is suppressed.
	 */
	function decodeFlate($input) {
		$inflated = @gzuncompress($input);

		return $inflated;
	}

	/**
	 * Read the entries of the first "<< ... >>" dictionary of a PDF object.
	 *
	 * The dictionary text is cut on "/". A piece made of a single word becomes
	 * `word => true`; a piece with several words becomes
	 * `first word => second word` (further words are ignored).
	 *
	 * @param string $object Raw text of the object.
	 * @return array Entries found, or an empty array when there is no dictionary.
	 */
	function getObjectOptions($object) {
		$found = array();
		if (!preg_match("#<<(.*)>>#ismU", $object, $found)) {
			return $found;
		}

		$entries = explode("/", $found[1]);
		// Drop whatever precedes the first "/".
		array_shift($entries);

		$parsed = array();
		foreach ($entries as $entry) {
			$entry = preg_replace("#\s+#", " ", trim($entry));
			if (strpos($entry, " ") === false) {
				$parsed[$entry] = true;
				continue;
			}
			$words = explode(" ", $entry);
			$parsed[$words[0]] = $words[1];
		}

		return $parsed;
	}

	/**
	 * Apply the decoding filters named in an object's options to its stream.
	 *
	 * Without a "Filter" option the stream is returned untouched. Otherwise
	 * the stream is cut to "Length" bytes (the whole stream when "Length" is
	 * empty) and every option key naming a supported filter (ASCIIHexDecode,
	 * ASCII85Decode, FlateDecode) is applied, in the order of the keys.
	 *
	 * @param string $stream  Raw stream data.
	 * @param array  $options Options as returned by getObjectOptions().
	 * @return string|false   Decoded data; whatever the last filter returned.
	 */
	function getDecodedStream($stream, $options) {
		if (empty($options["Filter"])) {
			return $stream;
		}

		$limit = empty($options["Length"]) ? strlen($stream) : $options["Length"];
		$decoded = substr($stream, 0, $limit);

		foreach (array_keys($options) as $name) {
			if ($name == "ASCIIHexDecode") {
				$decoded = decodeAsciiHex($decoded);
			}
			if ($name == "ASCII85Decode") {
				$decoded = decodeAscii85($decoded);
			}
			if ($name == "FlateDecode") {
				$decoded = decodeFlate($decoded);
			}
		}

		return $decoded;
	}

	/**
	 * Collect the raw text operands found in PDF text containers.
	 *
	 * For each container, the contents of every "[...] TJ" array are appended
	 * to $texts; when a container has none, the "(...)" operands of its
	 * "Td (...) Tj" sequences are appended instead.
	 *
	 * @param array $texts          Accumulator, extended in place.
	 * @param array $textContainers Text blocks (contents of BT ... ET).
	 * @return void
	 */
	function getDirtyTexts(&$texts, $textContainers) {
		foreach ($textContainers as $container) {
			// The second pattern is only tried when the first finds nothing.
			if (preg_match_all("#\[(.*)\]\s*TJ#ismU", $container, $found)
				|| preg_match_all("#Td\s*(\(.*\))\s*Tj#ismU", $container, $found)) {
				$texts = array_merge($texts, $found[1]);
			}
		}
	}

	/**
	 * Read the character mappings declared in a stream.
	 *
	 * Handles "N beginbfchar ... endbfchar" sections (one "<src> <dst>" pair
	 * per line) and "N beginbfrange ... endbfrange" sections (per line either
	 * "<first> <last> <dstStart>" or "<first> <last> [<dst> <dst> ...]").
	 * At most N lines of each section are read.
	 *
	 * @param array  $transformations Map of source code => replacement code
	 *                                (hexadecimal strings), extended in place.
	 * @param string $stream          Decoded stream data.
	 * @return void
	 */
	function getCharTransformations(&$transformations, $stream) {
		preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $charSections, PREG_SET_ORDER);
		preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $rangeSections, PREG_SET_ORDER);

		// Individual character mappings
		foreach ($charSections as $section) {
			$declared = $section[1];
			$rows = explode("\n", trim($section[2]));
			for ($k = 0; $k < $declared && $k < count($rows); $k++) {
				if (!preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($rows[$k]), $map)) {
					continue;
				}
				// Short source codes are completed with zeros on the right.
				$transformations[str_pad($map[1], 4, "0")] = $map[2];
			}
		}

		// Range mappings
		foreach ($rangeSections as $section) {
			$declared = $section[1];
			$rows = explode("\n", trim($section[2]));
			for ($k = 0; $k < $declared && $k < count($rows); $k++) {
				$row = trim($rows[$k]);

				if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", $row, $map)) {
					// Consecutive sources map to consecutive destinations.
					$first = hexdec($map[1]);
					$last = hexdec($map[2]);
					$target = hexdec($map[3]);
					for ($code = $first; $code <= $last; $code++) {
						$transformations[sprintf("%04X", $code)] = sprintf("%04X", $target + ($code - $first));
					}
				} elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", $row, $map)) {
					// Each source has its own destination, listed in order.
					$first = hexdec($map[1]);
					$last = hexdec($map[2]);
					$targets = preg_split("#\s+#", trim($map[3]));
					$n = 0;
					for ($code = $first; $code <= $last && $n < count($targets); $code++) {
						$transformations[sprintf("%04X", $code)] = sprintf("%04X", hexdec($targets[$n]));
						$n++;
					}
				}
			}
		}
	}

	/**
	 * Turn raw PDF text operands into readable text.
	 *
	 * Each entry of $texts is scanned for "<hex>" strings and "(literal)"
	 * strings. Hex strings are cut into 4-digit codes, replaced through
	 * $transformations when a mapping exists, and converted with
	 * html_entity_decode(). Literal strings are copied with their backslash
	 * escapes resolved (\n \r \t \b \f \\ \( \) and \ddd octal codes); an
	 * unknown escape is dropped together with the character after it. A line
	 * feed is appended after every entry.
	 *
	 * @param array $texts           Operands collected by getDirtyTexts().
	 * @param array $transformations Map built by getCharTransformations().
	 * @return string Extracted text.
	 */
	function getTextUsingTransformations($texts, $transformations) {
		// Double quotes are required here: '\n' would be backslash + n.
		// PHP has no "\b" escape, hence chr(8) for backspace.
		$escapes = array(
			"n" => "\n",
			"r" => "\r",
			"t" => "\t",
			"b" => chr(8),
			"f" => "\f",
			"\\" => "\\",
			"(" => "(",
			")" => ")",
		);

		$document = '';
		foreach ($texts as $text) {
			$isHex = false;
			$isPlain = false;

			$hex = '';
			$plain = '';
			$length = strlen($text);
			for ($j = 0; $j < $length; $j++) {
				$c = $text[$j];
				switch ($c) {
					case "<":
						// Inside a literal string, "<" is ordinary text.
						if ($isPlain) {
							$plain .= $c;
							break;
						}
						$hex = "";
						$isHex = true;
						break;
					case ">":
						if ($isPlain) {
							$plain .= $c;
							break;
						}
						// Nothing to emit for "<>" or a stray ">".
						if ($isHex && $hex !== '') {
							foreach (str_split($hex, 4) as $chex) {
								$chex = str_pad($chex, 4, "0");
								if (isset($transformations[$chex]))
									$chex = $transformations[$chex];
								$document .= html_entity_decode("&#x".$chex.";");
							}
						}
						$hex = '';
						$isHex = false;
						break;
					case "(":
						$plain = "";
						$isPlain = true;
						break;
					case ")":
						$document .= $plain;
						$isPlain = false;
						break;
					case "\\":
						// A backslash at the very end has nothing to escape.
						if ($j + 1 >= $length)
							break;
						$c2 = $text[$j + 1];
						if (isset($escapes[$c2])) {
							$plain .= $escapes[$c2];
						} elseif (strpos('01234567', $c2) !== false) {
							// Up to three leading octal digits form the code.
							preg_match('#^[0-7]{1,3}#', substr($text, $j + 1, 3), $octal);
							$j += strlen($octal[0]) - 1;
							$plain .= html_entity_decode("&#".octdec($octal[0]).";");
						}
						$j++;
						break;
					default:
						if ($isHex)		$hex .= $c;
						if ($isPlain)	$plain .= $c;
						break;
				}
			}
			$document .= "\n";
		}

		return $document;
	}

	/**
	 * Extract the text of a PDF file.
	 *
	 * Every "obj ... endobj" section containing a stream is examined. Streams
	 * whose object declares Length1, Type or Subtype are skipped. Decoded
	 * streams containing BT ... ET blocks supply the text operands; the other
	 * ones are searched for character mappings. The mappings are applied to
	 * the operands once all objects have been read.
	 *
	 * @param string $filename Path of the PDF file.
	 * @return string Extracted text, '' when the file is unreadable or empty.
	 */
	function pdf2text($filename) {
		// FILE_BINARY used to be passed here: it landed in the
		// $use_include_path slot with the value 0, and is deprecated since
		// PHP 8.1. Dropping it changes nothing.
		$infile = @file_get_contents($filename);
		if (empty($infile))	return '';

		$transformations = array();
		$texts = array();

		preg_match_all("#obj(.*)endobj#ismU", $infile, $objects);
		$objects = isset($objects[1]) ? $objects[1] : array();

		foreach ($objects as $currentObject) {
			if (!preg_match("#stream(.*)endstream#ismU", $currentObject, $stream))
				continue;
			$stream = ltrim($stream[1]);

			$options = getObjectOptions($currentObject);
			// Only streams declaring none of Length1, Type and Subtype are read.
			if (!empty($options["Length1"]) || !empty($options["Type"]) || !empty($options["Subtype"]))
				continue;

			$data = getDecodedStream($stream, $options);
			// A failed decode yields false or '': nothing to read.
			if (!is_string($data) || $data === '')
				continue;

			if (preg_match_all("#BT(.*)ET#ismU", $data, $textContainers))
				getDirtyTexts($texts, $textContainers[1]);
			else
				getCharTransformations($transformations, $data);
		}

		return getTextUsingTransformations($texts, $transformations);
	}
?>