extranet/library/Zend/Pdf/FileParser.php

486 lines
16 KiB
PHP
Raw Normal View History

2010-11-18 13:46:34 +00:00
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Pdf
* @subpackage FileParser
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: FileParser.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* Abstract utility class for parsing binary files.
*
* Provides a library of methods to quickly navigate and extract various data
* types (signed and unsigned integers, floating- and fixed-point numbers,
* strings, etc.) from the file.
*
* File access is managed via a {@link Zend_Pdf_FileParserDataSource} object.
* This allows the same parser code to work with many different data sources:
* in-memory objects, filesystem files, etc.
*
* @package Zend_Pdf
* @subpackage FileParser
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Pdf_FileParser
{
  /**** Class Constants ****/

    /**
     * Little-endian byte order (0x04 0x03 0x02 0x01).
     */
    const BYTE_ORDER_LITTLE_ENDIAN = 0;

    /**
     * Big-endian byte order (0x01 0x02 0x03 0x04).
     */
    const BYTE_ORDER_BIG_ENDIAN = 1;


  /**** Instance Variables ****/

    /**
     * Flag indicating that the file has passed a cursory validation check.
     * @var boolean
     */
    protected $_isScreened = false;

    /**
     * Flag indicating that the file has been successfully parsed.
     * @var boolean
     */
    protected $_isParsed = false;

    /**
     * Object representing the data source to be parsed.
     * @var Zend_Pdf_FileParserDataSource
     */
    protected $_dataSource = null;


  /**** Public Interface ****/

  /* Abstract Methods */

    /**
     * Performs a cursory check to verify that the binary file is in the expected
     * format. Intended to quickly weed out obviously bogus files.
     *
     * Must set $this->_isScreened to true if successful.
     *
     * @throws Zend_Pdf_Exception
     */
    abstract public function screen();

    /**
     * Reads and parses the complete binary file.
     *
     * Must set $this->_isParsed to true if successful.
     *
     * @throws Zend_Pdf_Exception
     */
    abstract public function parse();


  /* Object Lifecycle */

    /**
     * Object constructor.
     *
     * Rejects a data source that reports a size of zero; otherwise stores it
     * for use by every read/seek method of this class.
     *
     * @param Zend_Pdf_FileParserDataSource $dataSource
     * @throws Zend_Pdf_Exception If the data source reports a size of zero.
     */
    public function __construct(Zend_Pdf_FileParserDataSource $dataSource)
    {
        if ($dataSource->getSize() == 0) {
            require_once 'Zend/Pdf/Exception.php';
            throw new Zend_Pdf_Exception('The data source has not been properly initialized',
                                         Zend_Pdf_Exception::BAD_DATA_SOURCE);
        }
        $this->_dataSource = $dataSource;
    }

    /**
     * Object destructor.
     *
     * Discards the data source object.
     */
    public function __destruct()
    {
        $this->_dataSource = null;
    }


  /* Accessors */

    /**
     * Returns true if the file has passed a cursory validation check.
     *
     * @return boolean
     */
    public function isScreened()
    {
        return $this->_isScreened;
    }

    /**
     * Returns true if the file has been successfully parsed.
     *
     * @return boolean
     */
    public function isParsed()
    {
        return $this->_isParsed;
    }

    /**
     * Returns the data source object representing the file being parsed.
     *
     * @return Zend_Pdf_FileParserDataSource
     */
    public function getDataSource()
    {
        return $this->_dataSource;
    }


  /* Primitive Methods */

    /**
     * Convenience wrapper for the data source object's moveToOffset() method.
     *
     * @param integer $offset Destination byte offset.
     * @throws Zend_Pdf_Exception
     */
    public function moveToOffset($offset)
    {
        $this->_dataSource->moveToOffset($offset);
    }

    /**
     * Convenience wrapper for the data source object's getOffset() method.
     *
     * @return mixed Whatever the data source's getOffset() returns
     *   (presumably the current byte offset as an integer).
     */
    public function getOffset()
    {
        return $this->_dataSource->getOffset();
    }

    /**
     * Convenience wrapper for the data source object's getSize() method.
     *
     * @return mixed Whatever the data source's getSize() returns
     *   (presumably the total size in bytes as an integer).
     */
    public function getSize()
    {
        return $this->_dataSource->getSize();
    }

    /**
     * Convenience wrapper for the data source object's readBytes() method.
     *
     * @param integer $byteCount Number of bytes to read.
     * @return string
     * @throws Zend_Pdf_Exception
     */
    public function readBytes($byteCount)
    {
        return $this->_dataSource->readBytes($byteCount);
    }

    /**
     * Convenience wrapper for the data source object's skipBytes() method.
     *
     * @param integer $byteCount Number of bytes to skip.
     * @throws Zend_Pdf_Exception
     */
    public function skipBytes($byteCount)
    {
        $this->_dataSource->skipBytes($byteCount);
    }


  /* Parser Methods */

    /**
     * Reads the signed integer value from the binary file at the current byte
     * offset.
     *
     * Advances the offset by the number of bytes read. Both arguments are
     * validated before any bytes are consumed, so an invalid size or byte
     * order raises an exception without this method reading from the data
     * source.
     *
     * @param integer $size Size of integer in bytes: 1-4
     * @param integer $byteOrder (optional) Big- or little-endian byte order.
     *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
     *   If omitted, uses big-endian.
     * @return integer
     * @throws Zend_Pdf_Exception
     */
    public function readInt($size, $byteOrder = self::BYTE_ORDER_BIG_ENDIAN)
    {
        if (($size < 1) || ($size > 4)) {
            require_once 'Zend/Pdf/Exception.php';
            throw new Zend_Pdf_Exception("Invalid signed integer size: $size",
                                         Zend_Pdf_Exception::INVALID_INTEGER_SIZE);
        }
        /* Classify (and thereby validate) the byte order first so that a bad
         * argument cannot leave the read cursor advanced.
         */
        $isBigEndian = $this->_isBigEndian($byteOrder);
        $bytes = $this->_dataSource->readBytes($size);
        return $this->_decodeInteger($bytes, $size, $isBigEndian, true);
    }

    /**
     * Reads the unsigned integer value from the binary file at the current byte
     * offset.
     *
     * Advances the offset by the number of bytes read. Both arguments are
     * validated before any bytes are consumed, so an invalid size or byte
     * order raises an exception without this method reading from the data
     * source.
     *
     * NOTE: If you ask for a 4-byte unsigned integer on a 32-bit machine, the
     * resulting value WILL BE SIGNED because PHP uses signed integers internally
     * for everything. To guarantee portability, be sure to use bitwise
     * operators on large unsigned integers!
     *
     * @param integer $size Size of integer in bytes: 1-4
     * @param integer $byteOrder (optional) Big- or little-endian byte order.
     *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
     *   If omitted, uses big-endian.
     * @return integer
     * @throws Zend_Pdf_Exception
     */
    public function readUInt($size, $byteOrder = self::BYTE_ORDER_BIG_ENDIAN)
    {
        if (($size < 1) || ($size > 4)) {
            require_once 'Zend/Pdf/Exception.php';
            throw new Zend_Pdf_Exception("Invalid unsigned integer size: $size",
                                         Zend_Pdf_Exception::INVALID_INTEGER_SIZE);
        }
        /* Validate the byte order before touching the data source. */
        $isBigEndian = $this->_isBigEndian($byteOrder);
        $bytes = $this->_dataSource->readBytes($size);
        return $this->_decodeInteger($bytes, $size, $isBigEndian, false);
    }

    /**
     * Returns true if the specified bit is set in the integer bitfield.
     *
     * @param integer $bit Bit number to test (i.e. - 0-31)
     * @param integer $bitField
     * @return boolean
     */
    public function isBitSet($bit, $bitField)
    {
        $bitMask = 1 << $bit;
        return (($bitField & $bitMask) == $bitMask);
    }

    /**
     * Reads the signed fixed-point number from the binary file at the current
     * byte offset.
     *
     * Common fixed-point sizes are 2.14 and 16.16.
     *
     * The value is read as a signed integer of
     * ($mantissaBits + $fractionBits) / 8 bytes via {@link readInt()} and then
     * divided by 2^$fractionBits. Advances the offset by the number of bytes
     * read. Throws an exception if an error occurs.
     *
     * @param integer $mantissaBits Number of bits in the mantissa (the part
     *   to the left of the binary point)
     * @param integer $fractionBits Number of bits in the fraction
     * @param integer $byteOrder (optional) Big- or little-endian byte order.
     *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
     *   If omitted, uses big-endian.
     * @return float
     * @throws Zend_Pdf_Exception If the total bit count is not a whole number
     *   of bytes, or if {@link readInt()} rejects the resulting size or the
     *   byte order.
     */
    public function readFixed($mantissaBits, $fractionBits,
                              $byteOrder = self::BYTE_ORDER_BIG_ENDIAN)
    {
        $bitsToRead = $mantissaBits + $fractionBits;
        if (($bitsToRead % 8) !== 0) {
            require_once 'Zend/Pdf/Exception.php';
            throw new Zend_Pdf_Exception('Fixed-point numbers are whole bytes',
                                         Zend_Pdf_Exception::BAD_FIXED_POINT_SIZE);
        }
        $rawValue = $this->readInt(($bitsToRead >> 3), $byteOrder);
        return $rawValue / (1 << $fractionBits);
    }

    /**
     * Reads the Unicode UTF-16-encoded string from the binary file at the
     * current byte offset.
     *
     * The byte order of the UTF-16 string must be specified. You must also
     * supply the desired resulting character set.
     *
     * Advances the offset by the number of bytes read. The byte order is
     * validated before any bytes are consumed. A $byteCount of zero returns
     * an empty string immediately, without reading and without inspecting
     * $byteOrder.
     *
     * @todo Consider changing $byteCount to a character count. They are not
     *   always equivalent (in the case of surrogates).
     * @todo Make $byteOrder optional if there is a byte-order mark (BOM) in the
     *   string being extracted.
     *
     * @param integer $byteCount Number of bytes (characters * 2) to return.
     * @param integer $byteOrder (optional) Big- or little-endian byte order.
     *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
     *   If omitted, uses big-endian.
     * @param string $characterSet (optional) Desired resulting character set.
     *   You may use any character set supported by {@link iconv()}. If omitted,
     *   an empty string is handed to iconv() as the target, which is intended
     *   to select the current locale's character set.
     * @return string The raw bytes if $characterSet already matches the source
     *   encoding; otherwise the return value of iconv().
     * @throws Zend_Pdf_Exception
     */
    public function readStringUTF16($byteCount,
                                    $byteOrder = self::BYTE_ORDER_BIG_ENDIAN,
                                    $characterSet = '')
    {
        if ($byteCount == 0) {
            return '';
        }
        /* Resolve the source encoding (and reject a bad byte order) before
         * reading so that a bad argument cannot leave the cursor advanced.
         */
        $sourceEncoding = $this->_isBigEndian($byteOrder) ? 'UTF-16BE' : 'UTF-16LE';
        $bytes = $this->_dataSource->readBytes($byteCount);
        return $this->_convertEncoding($bytes, $sourceEncoding, $characterSet);
    }

    /**
     * Reads the Mac Roman-encoded string from the binary file at the current
     * byte offset.
     *
     * You must supply the desired resulting character set.
     *
     * Advances the offset by the number of bytes read. A $byteCount of zero
     * returns an empty string without reading.
     *
     * @param integer $byteCount Number of bytes (characters) to return.
     * @param string $characterSet (optional) Desired resulting character set.
     *   You may use any character set supported by {@link iconv()}. If omitted,
     *   an empty string is handed to iconv() as the target, which is intended
     *   to select the current locale's character set.
     * @return string The raw bytes if $characterSet is 'MacRoman'; otherwise
     *   the return value of iconv().
     * @throws Zend_Pdf_Exception
     */
    public function readStringMacRoman($byteCount, $characterSet = '')
    {
        if ($byteCount == 0) {
            return '';
        }
        $bytes = $this->_dataSource->readBytes($byteCount);
        return $this->_convertEncoding($bytes, 'MacRoman', $characterSet);
    }

    /**
     * Reads the Pascal string from the binary file at the current byte offset.
     *
     * The length of the Pascal string is determined by reading the length bytes
     * which precede the character data (as a big-endian unsigned integer via
     * {@link readUInt()}). You must supply the desired resulting character set.
     *
     * Advances the offset by the number of bytes read (length prefix plus
     * character data). A length of zero returns an empty string.
     *
     * @param string $characterSet (optional) Desired resulting character set.
     *   You may use any character set supported by {@link iconv()}. If omitted,
     *   an empty string is handed to iconv() as the target, which is intended
     *   to select the current locale's character set.
     * @param integer $lengthBytes (optional) Number of bytes that make up the
     *   length. Default is 1.
     * @return string The raw bytes if $characterSet is 'ASCII'; otherwise the
     *   return value of iconv().
     * @throws Zend_Pdf_Exception
     */
    public function readStringPascal($characterSet = '', $lengthBytes = 1)
    {
        $byteCount = $this->readUInt($lengthBytes);
        if ($byteCount == 0) {
            return '';
        }
        $bytes = $this->_dataSource->readBytes($byteCount);
        return $this->_convertEncoding($bytes, 'ASCII', $characterSet);
    }


  /**** Internal Methods ****/

    /**
     * Classifies a byte-order argument, rejecting anything that is neither of
     * the BYTE_ORDER_ constants.
     *
     * The comparison is intentionally loose (==), and big-endian is tested
     * first, so that every value accepted by earlier versions of this class
     * is still accepted and is interpreted the same way.
     *
     * @param integer $byteOrder
     * @return boolean True for big-endian, false for little-endian.
     * @throws Zend_Pdf_Exception If the value matches neither constant.
     */
    private function _isBigEndian($byteOrder)
    {
        if ($byteOrder == self::BYTE_ORDER_BIG_ENDIAN) {
            return true;
        }
        if ($byteOrder == self::BYTE_ORDER_LITTLE_ENDIAN) {
            return false;
        }
        require_once 'Zend/Pdf/Exception.php';
        throw new Zend_Pdf_Exception("Invalid byte order: $byteOrder",
                                     Zend_Pdf_Exception::INVALID_BYTE_ORDER);
    }

    /**
     * Assembles an integer from the first $size bytes of a binary string.
     *
     * unpack() is not used because it does not allow for variable integer
     * sizes and always works in the host byte order for signed integers.
     *
     * @param string  $bytes       Raw bytes; indexes 0 .. $size-1 are used.
     * @param integer $size        Number of bytes making up the integer.
     * @param boolean $isBigEndian True if $bytes[0] is the most significant byte.
     * @param boolean $isSigned    True to interpret the value as two's complement.
     * @return integer
     */
    private function _decodeInteger($bytes, $size, $isBigEndian, $isSigned)
    {
        /* The sign lives in the high bit of the most significant byte. */
        $mostSignificantIndex = $isBigEndian ? 0 : ($size - 1);
        $isNegative = $isSigned
                   && ((ord($bytes[$mostSignificantIndex]) & 0x80) == 0x80);

        /* Accumulate from the most significant byte down. For a negative
         * number, build the one's complement of the magnitude and invert the
         * whole result at the end: the "+1" of two's complement is never added
         * so it never has to be subtracted. This works reliably on both 32-
         * and 64-bit systems.
         */
        $number = 0;
        for ($position = 0; $position < $size; $position++) {
            $index = $isBigEndian ? $position : ($size - 1 - $position);
            $byte  = ord($bytes[$index]);
            if ($isNegative) {
                $byte = (~$byte) & 0xff;
            }
            $number = ($number << 8) | $byte;
        }
        return $isNegative ? ~$number : $number;
    }

    /**
     * Converts raw bytes from a known source encoding to the requested one.
     *
     * When the requested character set equals the source encoding (loose
     * comparison) the bytes are returned untouched and iconv() is not called.
     *
     * @param string $bytes          Raw bytes as read from the data source.
     * @param string $sourceEncoding Encoding name of $bytes, as passed to iconv().
     * @param string $characterSet   Desired resulting character set.
     * @return string Converted string, or whatever iconv() returns on failure.
     */
    private function _convertEncoding($bytes, $sourceEncoding, $characterSet)
    {
        if ($characterSet == $sourceEncoding) {
            return $bytes;
        }
        return iconv($sourceEncoding, $characterSet, $bytes);
    }
}