Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Incredible slow parsing #385

Open
djklim87 opened this issue Apr 30, 2024 · 6 comments
Open

Incredible slow parsing #385

djklim87 opened this issue Apr 30, 2024 · 6 comments

Comments

@djklim87
Copy link

<?php
include('vendor/autoload.php');
use PHPSQLParser\PHPSQLParser;
$parser = new PHPSQLParser();
$query = file_get_contents('/tmp/aaa/insert.sql');
$t = microtime(true);
$parsed = $parser->parse($query);
var_dump(microtime(true) - $t);
{
    ....
    "require": {
        "greenlion/php-sql-parser": "v4.6.0"
    }
}

Result will

double(58.586335897446)

It's too slow for so a
insert.sql.zip
small file.

@djklim87
Copy link
Author

If you modify lexer like this, it will work 20 times faster double(1.3940100669861)

<?php
/**
 * PHPSQLLexer.php
 *
 * This file contains the lexer, which splits and recombines parts of the
 * SQL statement just before parsing.
 *
 * PHP version 5
 *
 * LICENSE:
 * Copyright (c) 2010-2014 Justin Swanhart and André Rothe
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * @author    André Rothe <andre.rothe@phosco.info>
 * @copyright 2010-2014 Justin Swanhart and André Rothe
 * @license   http://www.debian.org/misc/bsd.license  BSD License (3 Clause)
 * @version   SVN: $Id$
 *
 */

namespace PHPSQLParser\lexer;
use PHPSQLParser\exceptions\InvalidParameterException;

/**
 * This class splits the SQL string into little parts, which the parser can
 * use to build the result array.
 *
 * @author  André Rothe <andre.rothe@phosco.info>
 * @license http://www.debian.org/misc/bsd.license  BSD License (3 Clause)
 *
 */
class PHPSQLLexer {

    protected $splitters;

    /**
     * Constructor.
     *
     * It initializes some fields.
     */
    public function __construct() {
        $this->splitters = new LexerSplitter();
    }

    /**
     * Ends the given string $haystack with the string $needle?
     *
     * @param string $haystack
     * @param string $needle
     *
     * @return boolean true, if the parameter $haystack ends with the character sequences $needle, false otherwise
     */
    protected function endsWith($haystack, $needle) {
        $length = strlen($needle);
        if ($length == 0) {
            return true;
        }
        return (substr($haystack, -$length) === $needle);
    }

    public function split($sql) {
        if (!is_string($sql)) {
            throw new InvalidParameterException($sql);
        }
        $tokens = preg_split($this->splitters->getSplittersRegexPattern(), $sql, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
        $tokens = $this->concatComments($tokens);
        $tokens = $this->concatEscapeSequences($tokens);
        $tokens = $this->balanceBackticks($tokens);
        $tokens = $this->concatColReferences($tokens);
        $tokens = $this->balanceParenthesis($tokens);
        $tokens = $this->concatUserDefinedVariables($tokens);
        $tokens = $this->concatScientificNotations($tokens);
        $tokens = $this->concatNegativeNumbers($tokens);
        return $tokens;
    }

    protected function concatNegativeNumbers($tokens) {

    	$i = 0;
    	$cnt = count($tokens);
    	$possibleSign = true;

    	while ($i < $cnt) {

    		if (!isset($tokens[$i])) {
    			$i++;
    			continue;
    		}

    		$token = $tokens[$i];

    		// a sign is also possible on the first position of the tokenlist
    		if ($possibleSign === true) {
				if ($token === '-' || $token === '+') {
					if (is_numeric($tokens[$i + 1])) {
						$tokens[$i + 1] = $token . $tokens[$i + 1];
						unset($tokens[$i]);
					}
				}
				$possibleSign = false;
				continue;
    		}

    		// TODO: we can have sign of a number after "(" and ",", are others possible?
    		if (substr($token, -1, 1) === "," || substr($token, -1, 1) === "(") {
    			$possibleSign = true;
    		}

    		$i++;
   		}

   		return array_values($tokens);
    }

    protected function concatScientificNotations($tokens) {

        $i = 0;
        $cnt = count($tokens);
        $scientific = false;

        while ($i < $cnt) {

            if (!isset($tokens[$i])) {
                $i++;
                continue;
            }

            $token = $tokens[$i];

            if ($scientific === true) {
                if ($token === '-' || $token === '+') {
                    $tokens[$i - 1] .= $tokens[$i];
                    $tokens[$i - 1] .= $tokens[$i + 1];
                    unset($tokens[$i]);
                    unset($tokens[$i + 1]);

                } elseif (is_numeric($token)) {
                    $tokens[$i - 1] .= $tokens[$i];
                    unset($tokens[$i]);
                }
                $scientific = false;
                continue;
            }

            if (strtoupper(substr($token, -1, 1)) === 'E') {
                $scientific = is_numeric(substr($token, 0, -1));
            }

            $i++;
        }

        return array_values($tokens);
    }

    protected function concatUserDefinedVariables($tokens) {
        $i = 0;
        $cnt = count($tokens);
        $userdef = false;

        while ($i < $cnt) {

            if (!isset($tokens[$i])) {
                $i++;
                continue;
            }

            $token = $tokens[$i];

            if ($userdef !== false) {
                $tokens[$userdef] .= $token;
                unset($tokens[$i]);
                if ($token !== "@") {
                    $userdef = false;
                }
            }

            if ($userdef === false && $token === "@") {
                $userdef = $i;
            }

            $i++;
        }

        return array_values($tokens);
    }

    protected function concatComments($tokens) {

        $i = 0;
        $cnt = count($tokens);
        $comment = false;
        $backTicks = [];
        $in_string = false;
        $inline = false;

        while ($i < $cnt) {

            if (!isset($tokens[$i])) {
                $i++;
                continue;
            }

            $token = $tokens[$i];

            /*
             * Check to see if we're inside a value (i.e. back ticks).
             * If so inline comments are not valid.
             */
            if ($comment === false && $this->isBacktick($token)) {
                if (!empty($backTicks)) {
                    $lastBacktick = array_pop($backTicks);
                    if ($lastBacktick != $token) {
                        $backTicks[] = $lastBacktick; // Re-add last back tick
                        $backTicks[] = $token;
                    }
                } else {
                    $backTicks[] = $token;
                }
            }

            if($comment === false && ($token == "\"" || $token == "'")) {
                $in_string = !$in_string;
            }
            if(!$in_string) {
                if ($comment !== false) {
                    if ($inline === true && ($token === "\n" || $token === "\r\n")) {
                        $comment = false;
                    } else {
                        unset($tokens[$i]);
                        $tokens[$comment] .= $token;
                    }
                    if ($inline === false && ($token === "*/")) {
                        $comment = false;
                    }
                }

                if (($comment === false) && ($token === "--") && empty($backTicks)) {
                    $comment = $i;
                    $inline = true;
                }

                if (($comment === false) && (substr($token, 0, 1) === "#") && empty($backTicks)) {
                    $comment = $i;
                    $inline = true;
                }

                if (($comment === false) && ($token === "/*")) {
                    $comment = $i;
                    $inline = false;
                }
            }

            $i++;
        }

        return array_values($tokens);
    }

    protected function isBacktick($token) {
        return ($token === "'" || $token === "\"" || $token === "`");
    }

    protected function balanceBackticks($tokens) {
        $unsetCount = 0;
        $fullLength = sizeof($tokens);
        foreach ($tokens as $k=> $token){

            if ($unsetCount>0){
                unset($tokens[$k]);
                $unsetCount--;
                continue;
            }
            if ($this->isBacktick($token)) {
                list($token, $unsetCount) = $this->balanceCharacter($tokens, $k, $token, $fullLength);
                $tokens[$k]=$token;
            }
        }

        return array_values($tokens);
    }

    // backticks are not balanced within one token, so we have
    // to re-combine some tokens
    protected function balanceCharacter($tokens, $startPosition, $char, $fullLength) {

        $shift = 0;
        $startPosition ++;

        $between[] = $char;
        for ($i = $startPosition; $i < $fullLength; $i++) {
            $token = $tokens[$i];
            $between[] = $token;
            $shift++;
            if ($token === $char) {
                break;
            }
        }


        $result = implode("", $between);

        return [$result, $shift];
    }

    /**
     * This function concats some tokens to a column reference.
     * There are two different cases:
     *
     * 1. If the current token ends with a dot, we will add the next token
     * 2. If the next token starts with a dot, we will add it to the previous token
     *
     */
    protected function concatColReferences($tokens) {

        $cnt = count($tokens);
        $i = 0;
        while ($i < $cnt) {

            if (!isset($tokens[$i])) {
                $i++;
                continue;
            }

            if ($tokens[$i][0] === ".") {

                // concat the previous tokens, till the token has been changed
                $k = $i - 1;
                $len = strlen($tokens[$i]);
                while (($k >= 0) && ($len == strlen($tokens[$i]))) {
                    if (!isset($tokens[$k])) { // FIXME: this can be wrong if we have schema . table . column
                        $k--;
                        continue;
                    }
                    $tokens[$i] = $tokens[$k] . $tokens[$i];
                    unset($tokens[$k]);
                    $k--;
                }
            }

            if ($this->endsWith($tokens[$i], '.') && !is_numeric($tokens[$i])) {

                // concat the next tokens, till the token has been changed
                $k = $i + 1;
                $len = strlen($tokens[$i]);
                while (($k < $cnt) && ($len == strlen($tokens[$i]))) {
                    if (!isset($tokens[$k])) {
                        $k++;
                        continue;
                    }
                    $tokens[$i] .= $tokens[$k];
                    unset($tokens[$k]);
                    $k++;
                }
            }

            $i++;
        }

        return array_values($tokens);
    }

    protected function concatEscapeSequences($tokens) {
        $tokenCount = count($tokens);
        $i = 0;
        while ($i < $tokenCount) {

            if ($this->endsWith($tokens[$i], "\\")) {
                $i++;
                if (isset($tokens[$i])) {
                    $tokens[$i - 1] .= $tokens[$i];
                    unset($tokens[$i]);
                }
            }
            $i++;
        }
        return array_values($tokens);
    }

    protected function balanceParenthesis($tokens) {
        $token_count = count($tokens);
        $i = 0;
        while ($i < $token_count) {
            if ($tokens[$i] !== '(') {
                $i++;
                continue;
            }
            $count = 1;
            for ($n = $i + 1; $n < $token_count; $n++) {
                $token = $tokens[$n];
                if ($token === '(') {
                    $count++;
                }
                if ($token === ')') {
                    $count--;
                }
                $tokens[$i] .= $token;
                unset($tokens[$n]);
                if ($count === 0) {
                    $n++;
                    break;
                }
            }
            $i = $n;
        }
        return array_values($tokens);
    }
}

?>

@joebordes
Copy link

Interesting.
@djklim87, Did you run the unit tests after the change?

This is the diff

diff --git a/vendor/greenlion/php-sql-parser/src/PHPSQLParser/lexer/PHPSQLLexer.php b/vendor/greenlion/php-sql-parser/src/PHPSQLParser/lexer/PHPSQLLexer.php
index 8faf5574c..b27bcd15a 100644
--- a/vendor/greenlion/php-sql-parser/src/PHPSQLParser/lexer/PHPSQLLexer.php
+++ b/vendor/greenlion/php-sql-parser/src/PHPSQLParser/lexer/PHPSQLLexer.php
@@ -283,51 +283,43 @@ class PHPSQLLexer {
     }
 
     protected function balanceBackticks($tokens) {
-        $i = 0;
-        $cnt = count($tokens);
-        while ($i < $cnt) {
+        $unsetCount = 0;
+        $fullLength = sizeof($tokens);
+        foreach ($tokens as $k=> $token){
 
-            if (!isset($tokens[$i])) {
-                $i++;
+            if ($unsetCount>0){
+                unset($tokens[$k]);
+                $unsetCount--;
                 continue;
             }
-
-            $token = $tokens[$i];
-
             if ($this->isBacktick($token)) {
-                $tokens = $this->balanceCharacter($tokens, $i, $token);
+                list($token, $unsetCount) = $this->balanceCharacter($tokens, $k, $token, $fullLength);
+                $tokens[$k]=$token;
             }
-
-            $i++;
         }
 
-        return $tokens;
+        return array_values($tokens);
     }
 
     // backticks are not balanced within one token, so we have
     // to re-combine some tokens
-    protected function balanceCharacter($tokens, $idx, $char) {
+    protected function balanceCharacter($tokens, $startPosition, $char, $fullLength) {
 
-        $token_count = count($tokens);
-        $i = $idx + 1;
-        while ($i < $token_count) {
-
-            if (!isset($tokens[$i])) {
-                $i++;
-                continue;
-            }
+        $shift = 0;
+        $startPosition ++;
 
+        $between[] = $char;
+        for ($i = $startPosition; $i < $fullLength; $i++) {
             $token = $tokens[$i];
-            $tokens[$idx] .= $token;
-            unset($tokens[$i]);
-
+            $between[] = $token;
+            $shift++;
             if ($token === $char) {
                 break;
             }
-
-            $i++;
         }
-        return array_values($tokens);
+        $result = implode("", $between);
+
+        return [$result, $shift];
     }
 
     /**

would be nice if someone could validate that. I will try to find some time to test it and share.

Thanks
HTH

@joebordes
Copy link

unit tests are passing

@joebordes
Copy link

I created a phpbench script

<?php
include 'vendor/autoload.php';
use PHPSQLParser\PHPSQLParser;

class ParseInsertBench {

	/**
	* @Revs(20)
	* @Iterations(5)
	*/
	public function benchparseInsert() {
		$parser = new PHPSQLParser();
		$query = file_get_contents('insert.sql');
		$parser->parse($query);
	}

	/**
	* @Revs(20)
	* @Iterations(5)
	*/
	public function benchparseQuery() {
		$parser = new PHPSQLParser();
		$query = file_get_contents('query.sql');
		$parser->parse($query);
	}
}

BEFORE

./phpbench.phar run testspeed.php
PHPBench (1.2.15) running benchmarks... #standwithukraine
with PHP version 8.2.15, xdebug ✔, opcache ❌

\ParseInsertBench

    benchparseInsert........................I4 - Mo2.583m (±0.51%)
    benchparseQuery.........................I4 - Mo9.289ms (±1.21%)

Subjects: 2, Assertions: 0, Failures: 0, Errors: 0

AFTER

./phpbench.phar run testspeed.php
PHPBench (1.2.15) running benchmarks... #standwithukraine
with PHP version 8.2.15, xdebug ✔, opcache ❌

\ParseInsertBench

    benchparseInsert........................I4 - Mo1.966s (±0.71%)
    benchparseQuery.........................I4 - Mo9.634ms (±4.71%)

Subjects: 2, Assertions: 0, Failures: 0, Errors: 0

The difference is significant on the insert SQL but seems worse on the query SQL.

HTH

@djklim87
Copy link
Author

djklim87 commented May 2, 2024

@joebordes can you attach to this issue content of query.sql?

@joebordes
Copy link

select DISTINCT vtiger_invoice.subject AS 'Invoice_Subject', (CASE WHEN vtiger_invoice.salesorderid NOT LIKE '' THEN (CASE WHEN vtiger_salesorderInvoice.subject NOT LIKE '' THEN vtiger_salesorderInvoice.subject ELSE '' END) ELSE '' END) AS Invoice_Sales_Order, vtiger_invoice.customerno AS 'Invoice_Customer_No', vtiger_invoice.exciseduty AS 'Invoice_Excise_Duty', vtiger_invoice.salescommission AS 'Invoice_Sales_Commission', (CASE WHEN vtiger_accountInvoice.accountname NOT LIKE '' THEN (CASE WHEN vtiger_accountInvoice.accountname NOT LIKE '' THEN vtiger_accountInvoice.accountname ELSE '' END) ELSE '' END) AS Invoice_Account_Name, vtiger_invoicebillads.bill_street AS 'Invoice_Billing_Address', vtiger_invoiceshipads.ship_street AS 'Invoice_Shipping_Address', (CASE WHEN vtiger_assets.product NOT LIKE '' THEN (CASE WHEN vtiger_productsRelAssets603.productname NOT LIKE '' THEN vtiger_productsRelAssets603.productname ELSE '' END) ELSE '' END) AS Assets_Product_Name, vtiger_assets.serialnumber AS 'Assets_Serial_Number', vtiger_contactsubdetails.assistant AS 'Contacts_Assistant', (CASE WHEN vtiger_contactdetails.reportsto NOT LIKE '' THEN (CASE WHEN CONCAT(vtiger_contactdetailsContacts.firstname,' ',vtiger_contactdetailsContacts.lastname) NOT LIKE '' THEN CONCAT(vtiger_contactdetailsContacts.firstname,' ',vtiger_contactdetailsContacts.lastname) ELSE '' END) ELSE '' END) AS Contacts_Reports_To, vtiger_contactsubdetails.assistantphone AS 'Contacts_Assistant_Phone', vtiger_crmentity.crmid AS "LBL_ACTION" from vtiger_invoice inner join vtiger_crmentity on vtiger_crmentity.crmid=vtiger_invoice.invoiceid left join vtiger_groups on vtiger_groups.groupid = vtiger_crmentity.smownerid left join vtiger_users on vtiger_users.id = vtiger_crmentity.smownerid inner join vtiger_invoicebillads on vtiger_invoice.invoiceid=vtiger_invoicebillads.invoicebilladdressid inner join vtiger_invoiceshipads on vtiger_invoice.invoiceid=vtiger_invoiceshipads.invoiceshipaddressid left join vtiger_salesorder as vtiger_salesorderInvoice on vtiger_salesorderInvoice.salesorderid=vtiger_invoice.salesorderid left join vtiger_account as vtiger_accountInvoice on vtiger_accountInvoice.accountid = vtiger_invoice.accountid left join vtiger_reptmptbl_16632c2b68258b2627337190 as vtiger_assets on vtiger_assets.invoiceid=vtiger_invoice.invoiceid left join vtiger_crmentity as vtiger_crmentityAssets on vtiger_crmentityAssets.crmid = vtiger_assets.assetsid AND vtiger_crmentityAssets.deleted=0 left join vtiger_crmentity as vtiger_crmentityRelAssets0 on vtiger_crmentityRelAssets0.crmid = vtiger_assets.product and vtiger_crmentityRelAssets0.deleted=0 left join vtiger_products as vtiger_productsRelAssets603 on vtiger_productsRelAssets603.productid = vtiger_crmentityRelAssets0.crmid left join vtiger_reptmptbl_16632c2b6839a95870061631 as vtiger_contactdetails on vtiger_invoice.contactid=vtiger_contactdetails.contactid left join vtiger_crmentity as vtiger_crmentityContacts on vtiger_crmentityContacts.crmid = vtiger_contactdetails.contactid AND vtiger_crmentityContacts.deleted=0 left join vtiger_contactdetails as vtiger_contactdetailsContacts on vtiger_contactdetailsContacts.contactid = vtiger_contactdetails.reportsto left join vtiger_contactsubdetails on vtiger_contactdetails.contactid = vtiger_contactsubdetails.contactsubscriptionid WHERE vtiger_invoice.invoiceid>0 AND vtiger_crmentity.deleted=0  limit 0, 40

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants