roykin/tools/parser_sql/lexer/PHPSQLLexer.php
Thibault UBUNTU 3d7f60a05e push site
2016-03-03 10:33:17 +01:00

363 lines
10 KiB
PHP
Executable File

<?php
/**
* PHPSQLLexer.php
*
* This file contains the lexer, which splits and recombines parts of the
* SQL statement just before parsing.
*
* PHP version 5
*
* LICENSE:
* Copyright (c) 2010-2014 Justin Swanhart and André Rothe
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* @author André Rothe <andre.rothe@phosco.info>
* @copyright 2010-2014 Justin Swanhart and André Rothe
* @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
* @version SVN: $Id: PHPSQLLexer.php 842 2013-12-30 08:57:53Z phosco@gmx.de $
*
*/
require_once dirname(__FILE__) . '/LexerSplitter.php';
require_once dirname(__FILE__) . '/../exceptions/InvalidParameterException.php';
/**
* This class splits the SQL string into little parts, which the parser can
* use to build the result array.
*
* @author André Rothe <andre.rothe@phosco.info>
* @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
*
*/
class PHPSQLLexer {

    /**
     * Splitter registry consulted by split() to recognise token separators.
     *
     * @var LexerSplitter
     */
    protected $splitters;

    /**
     * Constructor.
     *
     * Creates the LexerSplitter instance used by split().
     */
    public function __construct() {
        $this->splitters = new LexerSplitter();
    }

    /**
     * Checks whether the string $haystack ends with the string $needle.
     *
     * An empty $needle always matches.
     *
     * @param string $haystack the string to inspect
     * @param string $needle   the expected suffix
     *
     * @return boolean true, if $haystack ends with the character sequence $needle, false otherwise
     */
    protected function endsWith($haystack, $needle) {
        $length = strlen($needle);
        if ($length === 0) {
            return true;
        }
        return (substr($haystack, -$length) === $needle);
    }

    /**
     * Splits an SQL string into tokens and recombines the pieces that belong together.
     *
     * The string is first cut at every splitter known to the LexerSplitter
     * (the longest splitter wins at each position; splitters are kept as
     * tokens of their own). The raw token list is then passed, in this order,
     * through concatEscapeSequences(), balanceBackticks(),
     * concatColReferences(), balanceParenthesis(), concatComments() and
     * concatUserDefinedVariables().
     *
     * @param string $sql the SQL statement
     *
     * @return array the list of tokens, indexed from 0
     *
     * @throws InvalidParameterException if $sql is not a string
     */
    public function split($sql) {
        if (!is_string($sql)) {
            throw new InvalidParameterException($sql);
        }

        $tokens = array();
        $token = "";

        $splitLen = $this->splitters->getMaxLengthOfSplitter();
        $len = strlen($sql);
        $pos = 0;

        while ($pos < $len) {

            // try the longest possible splitter first
            for ($i = $splitLen; $i > 0; $i--) {
                $substr = substr($sql, $pos, $i);
                if ($this->splitters->isSplitter($substr)) {

                    // flush the text collected since the previous splitter
                    if ($token !== "") {
                        $tokens[] = $token;
                    }

                    $tokens[] = $substr;
                    // near the end of the input substr() may return fewer than
                    // $i characters, so advance by what was really consumed
                    $pos += strlen($substr);
                    $token = "";

                    continue 2;
                }
            }

            // no splitter starts here, the character belongs to the current token
            $token .= $sql[$pos];
            $pos++;
        }

        if ($token !== "") {
            $tokens[] = $token;
        }

        $tokens = $this->concatEscapeSequences($tokens);
        $tokens = $this->balanceBackticks($tokens);
        $tokens = $this->concatColReferences($tokens);
        $tokens = $this->balanceParenthesis($tokens);
        $tokens = $this->concatComments($tokens);
        $tokens = $this->concatUserDefinedVariables($tokens);
        return $tokens;
    }

    /**
     * Glues every "@" token to the token that follows it.
     *
     * Consecutive "@" tokens are collected as well, so the tokens
     * "@", "@", "global" become the single token "@@global".
     *
     * @param array $tokens the token list
     *
     * @return array the re-indexed token list
     */
    protected function concatUserDefinedVariables($tokens) {
        $i = 0;
        $cnt = count($tokens);
        $userdef = false; // index of the token that collects the variable, or false

        while ($i < $cnt) {

            if (!isset($tokens[$i])) {
                $i++;
                continue;
            }

            $token = $tokens[$i];

            if ($userdef !== false) {
                $tokens[$userdef] .= $token;
                unset($tokens[$i]);
                // a further "@" keeps the variable open, anything else finishes it
                if ($token !== "@") {
                    $userdef = false;
                }
            }

            if ($userdef === false && $token === "@") {
                $userdef = $i;
            }

            $i++;
        }

        return array_values($tokens);
    }

    /**
     * Merges the tokens of a comment into one token.
     *
     * An inline comment starts at a "--" token and collects everything up to,
     * but not including, the next "\n" or "\r\n" token. A block comment starts
     * at a "/*" token and collects everything up to and including the next
     * closing block-comment token.
     *
     * @param array $tokens the token list
     *
     * @return array the re-indexed token list
     */
    protected function concatComments($tokens) {
        $i = 0;
        $cnt = count($tokens);
        $comment = false; // index of the token that collects the comment, or false
        $inline = false;  // only meaningful while $comment !== false

        while ($i < $cnt) {

            if (!isset($tokens[$i])) {
                $i++;
                continue;
            }

            $token = $tokens[$i];

            if ($comment !== false) {
                if ($inline === true && ($token === "\n" || $token === "\r\n")) {
                    // the line break ends the inline comment and stays a token of its own
                    $comment = false;
                } else {
                    unset($tokens[$i]);
                    $tokens[$comment] .= $token;
                }
                if ($inline === false && ($token === "*/")) {
                    $comment = false;
                }
            }

            if (($comment === false) && ($token === "--")) {
                $comment = $i;
                $inline = true;
            }

            if (($comment === false) && ($token === "/*")) {
                $comment = $i;
                $inline = false;
            }

            $i++;
        }

        return array_values($tokens);
    }

    /**
     * Checks whether the token is one of the quoting characters
     * single quote, double quote or backtick.
     *
     * @param string $token the token to inspect
     *
     * @return boolean true, if $token is exactly one quoting character
     */
    protected function isBacktick($token) {
        return ($token === "'" || $token === "\"" || $token === "`");
    }

    /**
     * Recombines quoted strings and quoted identifiers.
     *
     * Whenever a quoting character (see isBacktick()) is found, all following
     * tokens up to the matching quoting character are merged into it.
     *
     * @param array $tokens the token list
     *
     * @return array the token list, indexed from 0
     */
    protected function balanceBackticks($tokens) {
        // balanceCharacter() re-indexes the list, so start from a dense list
        // to keep $i pointing at the token that follows the merged one
        $tokens = array_values($tokens);
        $i = 0;

        // the list shrinks with every merge, so the count is re-read each time
        while ($i < count($tokens)) {

            if (!isset($tokens[$i])) {
                $i++;
                continue;
            }

            $token = $tokens[$i];

            if ($this->isBacktick($token)) {
                $tokens = $this->balanceCharacter($tokens, $i, $token);
            }

            $i++;
        }

        return $tokens;
    }

    /**
     * Merges the tokens behind position $idx into $tokens[$idx] until a token
     * equal to $char has been appended or the end of the list is reached.
     *
     * Backticks are not balanced within one token, so we have to re-combine
     * some tokens.
     *
     * @param array  $tokens the token list
     * @param int    $idx    the index of the opening quoting character
     * @param string $char   the quoting character which closes the sequence
     *
     * @return array the re-indexed token list
     */
    protected function balanceCharacter($tokens, $idx, $char) {
        $token_count = count($tokens);
        $i = $idx + 1;

        while ($i < $token_count) {

            if (!isset($tokens[$i])) {
                $i++;
                continue;
            }

            $token = $tokens[$i];
            $tokens[$idx] .= $token;
            unset($tokens[$i]);

            if ($token === $char) {
                break;
            }

            $i++;
        }

        return array_values($tokens);
    }

    /**
     * This function concats some tokens to a column reference.
     * There are two different cases:
     *
     * 1. If the current token starts with a dot, the previous token is prepended to it
     * 2. If the current token ends with a dot (and is not numeric), the next token is appended to it
     *
     * @param array $tokens the token list
     *
     * @return array the re-indexed token list
     */
    protected function concatColReferences($tokens) {
        $cnt = count($tokens);
        $i = 0;

        while ($i < $cnt) {

            if (!isset($tokens[$i])) {
                $i++;
                continue;
            }

            // substr() instead of $tokens[$i][0]: an empty token must not
            // raise an "Uninitialized string offset" error
            if (substr($tokens[$i], 0, 1) === ".") {

                // concat the previous tokens, till the token has been changed
                $k = $i - 1;
                $len = strlen($tokens[$i]);
                while (($k >= 0) && ($len == strlen($tokens[$i]))) {
                    if (!isset($tokens[$k])) { // FIXME: this can be wrong if we have schema . table . column
                        $k--;
                        continue;
                    }
                    $tokens[$i] = $tokens[$k] . $tokens[$i];
                    unset($tokens[$k]);
                    $k--;
                }
            }

            if ($this->endsWith($tokens[$i], '.') && !is_numeric($tokens[$i])) {

                // concat the next tokens, till the token has been changed
                $k = $i + 1;
                $len = strlen($tokens[$i]);
                while (($k < $cnt) && ($len == strlen($tokens[$i]))) {
                    if (!isset($tokens[$k])) {
                        $k++;
                        continue;
                    }
                    $tokens[$i] .= $tokens[$k];
                    unset($tokens[$k]);
                    $k++;
                }
            }

            $i++;
        }

        return array_values($tokens);
    }

    /**
     * Appends the following token to every token that ends with a backslash.
     *
     * The merged token is not inspected again, so two consecutive backslash
     * tokens become one token and do not swallow a third one.
     *
     * @param array $tokens the token list
     *
     * @return array the re-indexed token list
     */
    protected function concatEscapeSequences($tokens) {
        // the loop relies on consecutive indexes
        $tokens = array_values($tokens);
        $tokenCount = count($tokens);
        $i = 0;

        while ($i < $tokenCount) {

            if ($this->endsWith($tokens[$i], "\\")) {
                $i++;
                if (isset($tokens[$i])) {
                    $tokens[$i - 1] .= $tokens[$i];
                    unset($tokens[$i]);
                }
            }

            $i++;
        }

        return array_values($tokens);
    }

    /**
     * Merges every top-level parenthesised expression into a single token.
     *
     * Starting at a "(" token, all following tokens are appended to it until
     * the matching ")" has been appended; nested parentheses are counted.
     * If no matching ")" exists, the rest of the list is merged.
     *
     * @param array $tokens the token list
     *
     * @return array the re-indexed token list
     */
    protected function balanceParenthesis($tokens) {
        // the loops rely on consecutive indexes
        $tokens = array_values($tokens);
        $token_count = count($tokens);
        $i = 0;

        while ($i < $token_count) {

            if ($tokens[$i] !== '(') {
                $i++;
                continue;
            }

            $count = 1; // nesting depth, the opening parenthesis at $i included
            for ($n = $i + 1; $n < $token_count; $n++) {
                $token = $tokens[$n];
                if ($token === '(') {
                    $count++;
                }
                if ($token === ')') {
                    $count--;
                }
                $tokens[$i] .= $token;
                unset($tokens[$n]);
                if ($count === 0) {
                    $n++;
                    break;
                }
            }

            // continue behind the tokens which have been merged
            $i = $n;
        }

        return array_values($tokens);
    }
}
?>