-
Notifications
You must be signed in to change notification settings - Fork 156
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Incredible slow parsing #385
Comments
If you modify the lexer like this, it will run 20 times faster <?php
/**
* PHPSQLLexer.php
*
* This file contains the lexer, which splits and recombines parts of the
* SQL statement just before parsing.
*
* PHP version 5
*
* LICENSE:
* Copyright (c) 2010-2014 Justin Swanhart and André Rothe
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* @author André Rothe <andre.rothe@phosco.info>
* @copyright 2010-2014 Justin Swanhart and André Rothe
* @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
* @version SVN: $Id$
*
*/
namespace PHPSQLParser\lexer;
use PHPSQLParser\exceptions\InvalidParameterException;
/**
* This class splits the SQL string into little parts, which the parser can
* use to build the result array.
*
* @author André Rothe <andre.rothe@phosco.info>
* @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
*
*/
class PHPSQLLexer {

    /**
     * Supplies the regular expression used by split() to cut the SQL
     * string into raw tokens.
     *
     * @var LexerSplitter
     */
    protected $splitters;

    /**
     * Constructor.
     *
     * It initializes some fields.
     */
    public function __construct() {
        $this->splitters = new LexerSplitter();
    }

    /**
     * Ends the given string $haystack with the string $needle?
     *
     * @param string $haystack
     * @param string $needle
     *
     * @return boolean true, if the parameter $haystack ends with the character sequences $needle, false otherwise
     */
    protected function endsWith($haystack, $needle) {
        $length = strlen($needle);
        if ($length == 0) {
            return true;
        }
        return (substr($haystack, -$length) === $needle);
    }

    /**
     * Splits an SQL statement into tokens and recombines the raw pieces
     * (comments, escape sequences, quoted strings, column references,
     * parenthesised expressions, user variables and numbers) so that each
     * of them ends up in a single token.
     *
     * The order of the passes matters: each pass works on the list
     * produced by the previous one.
     *
     * @param string $sql the SQL statement
     *
     * @return array the re-indexed list of tokens
     *
     * @throws InvalidParameterException if $sql is not a string
     */
    public function split($sql) {
        if (!is_string($sql)) {
            throw new InvalidParameterException($sql);
        }
        $tokens = preg_split($this->splitters->getSplittersRegexPattern(), $sql, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
        $tokens = $this->concatComments($tokens);
        $tokens = $this->concatEscapeSequences($tokens);
        $tokens = $this->balanceBackticks($tokens);
        $tokens = $this->concatColReferences($tokens);
        $tokens = $this->balanceParenthesis($tokens);
        $tokens = $this->concatUserDefinedVariables($tokens);
        $tokens = $this->concatScientificNotations($tokens);
        $tokens = $this->concatNegativeNumbers($tokens);
        return $tokens;
    }

    /**
     * Merges a leading "+" or "-" token into the numeric token that
     * directly follows it.
     *
     * A sign is only considered at the first position of the list and
     * directly after a token ending in "," or "(".
     *
     * @param array $tokens list of tokens
     *
     * @return array the re-indexed list of tokens
     */
    protected function concatNegativeNumbers($tokens) {
        $i = 0;
        $cnt = count($tokens);
        $possibleSign = true;

        while ($i < $cnt) {
            if (!isset($tokens[$i])) {
                $i++;
                continue;
            }
            $token = $tokens[$i];

            // a sign is also possible on the first position of the tokenlist
            if ($possibleSign === true) {
                // the isset() guard covers a sign that is the very last token;
                // without it the lookahead reads an undefined array key
                if (($token === '-' || $token === '+')
                    && isset($tokens[$i + 1])
                    && is_numeric($tokens[$i + 1])) {
                    $tokens[$i + 1] = $token . $tokens[$i + 1];
                    unset($tokens[$i]);
                }
                $possibleSign = false;
                // re-run the loop on the same index: it is either skipped
                // (unset above) or inspected by the check below
                continue;
            }

            // TODO: we can have sign of a number after "(" and ",", are others possible?
            if (substr($token, -1, 1) === "," || substr($token, -1, 1) === "(") {
                $possibleSign = true;
            }
            $i++;
        }
        return array_values($tokens);
    }

    /**
     * Re-joins numbers in scientific notation that the splitter cut apart.
     *
     * A token that ends in "E"/"e" and whose remainder is numeric starts a
     * candidate. If the next token is a sign, the sign and the token after
     * it are appended to the candidate; if the next token is numeric, it is
     * appended directly.
     *
     * @param array $tokens list of tokens
     *
     * @return array the re-indexed list of tokens
     */
    protected function concatScientificNotations($tokens) {
        $i = 0;
        $cnt = count($tokens);
        $scientific = false;

        while ($i < $cnt) {
            if (!isset($tokens[$i])) {
                $i++;
                continue;
            }
            $token = $tokens[$i];

            if ($scientific === true) {
                if ($token === '-' || $token === '+') {
                    $tokens[$i - 1] .= $token;
                    // the sign may be the last token; only consume the
                    // exponent when there is one
                    if (isset($tokens[$i + 1])) {
                        $tokens[$i - 1] .= $tokens[$i + 1];
                        unset($tokens[$i + 1]);
                    }
                    unset($tokens[$i]);
                } elseif (is_numeric($token)) {
                    $tokens[$i - 1] .= $token;
                    unset($tokens[$i]);
                }
                $scientific = false;
                continue;
            }

            if (strtoupper(substr($token, -1, 1)) === 'E') {
                $scientific = is_numeric(substr($token, 0, -1));
            }
            $i++;
        }
        return array_values($tokens);
    }

    /**
     * Joins "@" tokens with the token that follows them, so that user
     * defined variables ("@var") and consecutive "@" signs ("@@var") form
     * one token.
     *
     * @param array $tokens list of tokens
     *
     * @return array the re-indexed list of tokens
     */
    protected function concatUserDefinedVariables($tokens) {
        $i = 0;
        $cnt = count($tokens);
        // index of the "@" token currently being extended, or false
        $userdef = false;

        while ($i < $cnt) {
            if (!isset($tokens[$i])) {
                $i++;
                continue;
            }
            $token = $tokens[$i];

            if ($userdef !== false) {
                $tokens[$userdef] .= $token;
                unset($tokens[$i]);
                // keep collecting while further "@" signs follow
                if ($token !== "@") {
                    $userdef = false;
                }
            }

            if ($userdef === false && $token === "@") {
                $userdef = $i;
            }
            $i++;
        }
        return array_values($tokens);
    }

    /**
     * Collapses every comment into the token that opened it.
     *
     * Inline comments start with "--" or a token beginning with "#" and
     * end at a "\n" or "\r\n" token (which is kept as its own token).
     * Block comments start with "/*" and include the closing "*" + "/" token.
     * Nothing is treated as a comment while inside a single or double
     * quoted string, and inline comments are not started while a quote
     * character is still open.
     *
     * @param array $tokens list of tokens
     *
     * @return array the re-indexed list of tokens
     */
    protected function concatComments($tokens) {
        $i = 0;
        $cnt = count($tokens);
        // index of the token that opened the current comment, or false
        $comment = false;
        // stack of currently open quote characters
        $backTicks = [];
        $in_string = false;
        $inline = false;

        while ($i < $cnt) {
            if (!isset($tokens[$i])) {
                $i++;
                continue;
            }
            $token = $tokens[$i];

            /*
             * Check to see if we're inside a value (i.e. back ticks).
             * If so inline comments are not valid.
             */
            if ($comment === false && $this->isBacktick($token)) {
                if (!empty($backTicks)) {
                    $lastBacktick = array_pop($backTicks);
                    if ($lastBacktick !== $token) {
                        $backTicks[] = $lastBacktick; // Re-add last back tick
                        $backTicks[] = $token;
                    }
                } else {
                    $backTicks[] = $token;
                }
            }

            if ($comment === false && ($token === "\"" || $token === "'")) {
                $in_string = !$in_string;
            }

            if (!$in_string) {
                if ($comment !== false) {
                    if ($inline === true && ($token === "\n" || $token === "\r\n")) {
                        $comment = false;
                    } else {
                        unset($tokens[$i]);
                        $tokens[$comment] .= $token;
                    }
                    if ($inline === false && ($token === "*/")) {
                        $comment = false;
                    }
                }

                if (($comment === false) && ($token === "--") && empty($backTicks)) {
                    $comment = $i;
                    $inline = true;
                }
                if (($comment === false) && (substr($token, 0, 1) === "#") && empty($backTicks)) {
                    $comment = $i;
                    $inline = true;
                }
                if (($comment === false) && ($token === "/*")) {
                    $comment = $i;
                    $inline = false;
                }
            }
            $i++;
        }
        return array_values($tokens);
    }

    /**
     * Is the given token one of the quote characters ', " or `?
     *
     * @param string $token
     *
     * @return boolean
     */
    protected function isBacktick($token) {
        return ($token === "'" || $token === "\"" || $token === "`");
    }

    /**
     * Joins everything between an opening quote character and its matching
     * closing character (both included) into one token.
     *
     * The list is walked once; the tokens consumed by a quoted section are
     * dropped as the walk passes them and the list is re-indexed a single
     * time at the end.
     *
     * @param array $tokens list of tokens, indexed 0..n-1 without gaps
     *
     * @return array the re-indexed list of tokens
     */
    protected function balanceBackticks($tokens) {
        // number of upcoming tokens already merged into a quoted token
        $unsetCount = 0;
        $fullLength = count($tokens);

        foreach ($tokens as $k => $token) {
            if ($unsetCount > 0) {
                unset($tokens[$k]);
                $unsetCount--;
                continue;
            }
            if ($this->isBacktick($token)) {
                list($token, $unsetCount) = $this->balanceCharacter($tokens, $k, $token, $fullLength);
                $tokens[$k] = $token;
            }
        }
        return array_values($tokens);
    }

    /**
     * Collects the tokens following an opening quote character up to and
     * including the next occurrence of the same character (or up to the end
     * of the list if it is never closed).
     *
     * backticks are not balanced within one token, so we have
     * to re-combine some tokens
     *
     * @param array   $tokens        list of tokens
     * @param integer $startPosition index of the opening quote character
     * @param string  $char          the quote character to balance
     * @param integer $fullLength    number of entries of the original list
     *
     * @return array [combined token, number of following tokens consumed]
     */
    protected function balanceCharacter($tokens, $startPosition, $char, $fullLength) {
        $shift = 0;
        $startPosition++;
        $between = [$char];

        for ($i = $startPosition; $i < $fullLength; $i++) {
            $token = $tokens[$i];
            $between[] = $token;
            $shift++;
            if ($token === $char) {
                break;
            }
        }

        $result = implode("", $between);
        return [$result, $shift];
    }

    /**
     * This function concats some tokens to a column reference.
     * There are two different cases:
     *
     * 1. If the current token ends with a dot, we will add the next token
     * 2. If the next token starts with a dot, we will add it to the previous token
     *
     * @param array $tokens list of tokens
     *
     * @return array the re-indexed list of tokens
     */
    protected function concatColReferences($tokens) {
        $cnt = count($tokens);
        $i = 0;

        while ($i < $cnt) {
            if (!isset($tokens[$i])) {
                $i++;
                continue;
            }

            if ($tokens[$i][0] === ".") {
                // concat the previous tokens, till the token has been changed
                $k = $i - 1;
                $len = strlen($tokens[$i]);
                while (($k >= 0) && ($len == strlen($tokens[$i]))) {
                    if (!isset($tokens[$k])) { // FIXME: this can be wrong if we have schema . table . column
                        $k--;
                        continue;
                    }
                    $tokens[$i] = $tokens[$k] . $tokens[$i];
                    unset($tokens[$k]);
                    $k--;
                }
            }

            if ($this->endsWith($tokens[$i], '.') && !is_numeric($tokens[$i])) {
                // concat the next tokens, till the token has been changed
                $k = $i + 1;
                $len = strlen($tokens[$i]);
                while (($k < $cnt) && ($len == strlen($tokens[$i]))) {
                    if (!isset($tokens[$k])) {
                        $k++;
                        continue;
                    }
                    $tokens[$i] .= $tokens[$k];
                    unset($tokens[$k]);
                    $k++;
                }
            }
            $i++;
        }
        return array_values($tokens);
    }

    /**
     * Appends the token following a token that ends in a backslash to
     * that token, so the escaped character is not interpreted on its own.
     *
     * @param array $tokens list of tokens, indexed 0..n-1 without gaps
     *
     * @return array the re-indexed list of tokens
     */
    protected function concatEscapeSequences($tokens) {
        $tokenCount = count($tokens);
        $i = 0;

        while ($i < $tokenCount) {
            if ($this->endsWith($tokens[$i], "\\")) {
                $i++;
                if (isset($tokens[$i])) {
                    $tokens[$i - 1] .= $tokens[$i];
                    unset($tokens[$i]);
                }
            }
            $i++;
        }
        return array_values($tokens);
    }

    /**
     * Joins every top-level "(" token with all tokens up to its matching
     * ")" (nested pairs included) into one token. An unclosed "(" swallows
     * the rest of the list.
     *
     * @param array $tokens list of tokens, indexed 0..n-1 without gaps
     *
     * @return array the re-indexed list of tokens
     */
    protected function balanceParenthesis($tokens) {
        $token_count = count($tokens);
        $i = 0;

        while ($i < $token_count) {
            if ($tokens[$i] !== '(') {
                $i++;
                continue;
            }

            // nesting depth of the expression opened at $i
            $count = 1;
            for ($n = $i + 1; $n < $token_count; $n++) {
                $token = $tokens[$n];
                if ($token === '(') {
                    $count++;
                }
                if ($token === ')') {
                    $count--;
                }
                $tokens[$i] .= $token;
                unset($tokens[$n]);
                if ($count === 0) {
                    $n++;
                    break;
                }
            }
            // continue behind the tokens that were merged
            $i = $n;
        }
        return array_values($tokens);
    }
}
?> |
Interesting. This is the diff diff --git a/vendor/greenlion/php-sql-parser/src/PHPSQLParser/lexer/PHPSQLLexer.php b/vendor/greenlion/php-sql-parser/src/PHPSQLParser/lexer/PHPSQLLexer.php
index 8faf5574c..b27bcd15a 100644
--- a/vendor/greenlion/php-sql-parser/src/PHPSQLParser/lexer/PHPSQLLexer.php
+++ b/vendor/greenlion/php-sql-parser/src/PHPSQLParser/lexer/PHPSQLLexer.php
@@ -283,51 +283,43 @@ class PHPSQLLexer {
}
protected function balanceBackticks($tokens) {
- $i = 0;
- $cnt = count($tokens);
- while ($i < $cnt) {
+ $unsetCount = 0;
+ $fullLength = sizeof($tokens);
+ foreach ($tokens as $k=> $token){
- if (!isset($tokens[$i])) {
- $i++;
+ if ($unsetCount>0){
+ unset($tokens[$k]);
+ $unsetCount--;
continue;
}
-
- $token = $tokens[$i];
-
if ($this->isBacktick($token)) {
- $tokens = $this->balanceCharacter($tokens, $i, $token);
+ list($token, $unsetCount) = $this->balanceCharacter($tokens, $k, $token, $fullLength);
+ $tokens[$k]=$token;
}
-
- $i++;
}
- return $tokens;
+ return array_values($tokens);
}
// backticks are not balanced within one token, so we have
// to re-combine some tokens
- protected function balanceCharacter($tokens, $idx, $char) {
+ protected function balanceCharacter($tokens, $startPosition, $char, $fullLength) {
- $token_count = count($tokens);
- $i = $idx + 1;
- while ($i < $token_count) {
-
- if (!isset($tokens[$i])) {
- $i++;
- continue;
- }
+ $shift = 0;
+ $startPosition ++;
+ $between[] = $char;
+ for ($i = $startPosition; $i < $fullLength; $i++) {
$token = $tokens[$i];
- $tokens[$idx] .= $token;
- unset($tokens[$i]);
-
+ $between[] = $token;
+ $shift++;
if ($token === $char) {
break;
}
-
- $i++;
}
- return array_values($tokens);
+ $result = implode("", $between);
+
+ return [$result, $shift];
}
/** It would be nice if someone could validate that. I will try to find some time to test it and share. Thanks |
unit tests are passing |
I created a phpbench script <?php
include 'vendor/autoload.php';
use PHPSQLParser\PHPSQLParser;
// Benchmark subjects that run PHPSQLParser::parse() on two SQL files
// read from the current working directory.
class ParseInsertBench {
// Parses the contents of insert.sql with a freshly created parser.
// NOTE(review): the file is read inside the subject, so file I/O is
// presumably part of the measured time - confirm.
/**
* @Revs(20)
* @Iterations(5)
*/
public function benchparseInsert() {
$parser = new PHPSQLParser();
$query = file_get_contents('insert.sql');
$parser->parse($query);
}
// Parses the contents of query.sql with a freshly created parser.
/**
* @Revs(20)
* @Iterations(5)
*/
public function benchparseQuery() {
$parser = new PHPSQLParser();
$query = file_get_contents('query.sql');
$parser->parse($query);
}
} BEFORE ./phpbench.phar run testspeed.php
PHPBench (1.2.15) running benchmarks... #standwithukraine
with PHP version 8.2.15, xdebug ✔, opcache ❌
\ParseInsertBench
benchparseInsert........................I4 - Mo2.583m (±0.51%)
benchparseQuery.........................I4 - Mo9.289ms (±1.21%)
Subjects: 2, Assertions: 0, Failures: 0, Errors: 0 AFTER ./phpbench.phar run testspeed.php
PHPBench (1.2.15) running benchmarks... #standwithukraine
with PHP version 8.2.15, xdebug ✔, opcache ❌
\ParseInsertBench
benchparseInsert........................I4 - Mo1.966s (±0.71%)
benchparseQuery.........................I4 - Mo9.634ms (±4.71%)
Subjects: 2, Assertions: 0, Failures: 0, Errors: 0 The difference is significant on the insert SQL but seems worse on the query SQL. HTH |
@joebordes can you attach to this issue content of |
select DISTINCT vtiger_invoice.subject AS 'Invoice_Subject', (CASE WHEN vtiger_invoice.salesorderid NOT LIKE '' THEN (CASE WHEN vtiger_salesorderInvoice.subject NOT LIKE '' THEN vtiger_salesorderInvoice.subject ELSE '' END) ELSE '' END) AS Invoice_Sales_Order, vtiger_invoice.customerno AS 'Invoice_Customer_No', vtiger_invoice.exciseduty AS 'Invoice_Excise_Duty', vtiger_invoice.salescommission AS 'Invoice_Sales_Commission', (CASE WHEN vtiger_accountInvoice.accountname NOT LIKE '' THEN (CASE WHEN vtiger_accountInvoice.accountname NOT LIKE '' THEN vtiger_accountInvoice.accountname ELSE '' END) ELSE '' END) AS Invoice_Account_Name, vtiger_invoicebillads.bill_street AS 'Invoice_Billing_Address', vtiger_invoiceshipads.ship_street AS 'Invoice_Shipping_Address', (CASE WHEN vtiger_assets.product NOT LIKE '' THEN (CASE WHEN vtiger_productsRelAssets603.productname NOT LIKE '' THEN vtiger_productsRelAssets603.productname ELSE '' END) ELSE '' END) AS Assets_Product_Name, vtiger_assets.serialnumber AS 'Assets_Serial_Number', vtiger_contactsubdetails.assistant AS 'Contacts_Assistant', (CASE WHEN vtiger_contactdetails.reportsto NOT LIKE '' THEN (CASE WHEN CONCAT(vtiger_contactdetailsContacts.firstname,' ',vtiger_contactdetailsContacts.lastname) NOT LIKE '' THEN CONCAT(vtiger_contactdetailsContacts.firstname,' ',vtiger_contactdetailsContacts.lastname) ELSE '' END) ELSE '' END) AS Contacts_Reports_To, vtiger_contactsubdetails.assistantphone AS 'Contacts_Assistant_Phone', vtiger_crmentity.crmid AS "LBL_ACTION" from vtiger_invoice inner join vtiger_crmentity on vtiger_crmentity.crmid=vtiger_invoice.invoiceid left join vtiger_groups on vtiger_groups.groupid = vtiger_crmentity.smownerid left join vtiger_users on vtiger_users.id = vtiger_crmentity.smownerid inner join vtiger_invoicebillads on vtiger_invoice.invoiceid=vtiger_invoicebillads.invoicebilladdressid inner join vtiger_invoiceshipads on vtiger_invoice.invoiceid=vtiger_invoiceshipads.invoiceshipaddressid left join 
vtiger_salesorder as vtiger_salesorderInvoice on vtiger_salesorderInvoice.salesorderid=vtiger_invoice.salesorderid left join vtiger_account as vtiger_accountInvoice on vtiger_accountInvoice.accountid = vtiger_invoice.accountid left join vtiger_reptmptbl_16632c2b68258b2627337190 as vtiger_assets on vtiger_assets.invoiceid=vtiger_invoice.invoiceid left join vtiger_crmentity as vtiger_crmentityAssets on vtiger_crmentityAssets.crmid = vtiger_assets.assetsid AND vtiger_crmentityAssets.deleted=0 left join vtiger_crmentity as vtiger_crmentityRelAssets0 on vtiger_crmentityRelAssets0.crmid = vtiger_assets.product and vtiger_crmentityRelAssets0.deleted=0 left join vtiger_products as vtiger_productsRelAssets603 on vtiger_productsRelAssets603.productid = vtiger_crmentityRelAssets0.crmid left join vtiger_reptmptbl_16632c2b6839a95870061631 as vtiger_contactdetails on vtiger_invoice.contactid=vtiger_contactdetails.contactid left join vtiger_crmentity as vtiger_crmentityContacts on vtiger_crmentityContacts.crmid = vtiger_contactdetails.contactid AND vtiger_crmentityContacts.deleted=0 left join vtiger_contactdetails as vtiger_contactdetailsContacts on vtiger_contactdetailsContacts.contactid = vtiger_contactdetails.reportsto left join vtiger_contactsubdetails on vtiger_contactdetails.contactid = vtiger_contactsubdetails.contactsubscriptionid WHERE vtiger_invoice.invoiceid>0 AND vtiger_crmentity.deleted=0 limit 0, 40 |
Result will
It's too slow for so a
insert.sql.zip
small file.
The text was updated successfully, but these errors were encountered: