From 54fb59b7141a0511761a56ac7dc1c720b5610b6a Mon Sep 17 00:00:00 2001 From: iphydf Date: Sun, 20 Mar 2022 22:39:43 +0000 Subject: [PATCH] perf: Use ByteString for lexer; decode to Text immediately. This makes the end-to-end parsing about 2x faster. We can now parse all of toxcore in 0.45 seconds on my machine, making it parse around 3.4MiB/s (including TreeParser and CommentParser). The lexer consumes 0.11s out of those 0.45s, running at around 9.8MiB/s. --- BUILD.bazel | 5 ++++- cimple.cabal | 2 +- src/Language/Cimple/IO.hs | 30 ++++++++++++-------------- src/Language/Cimple/Lexer.x | 25 +++++++++++----------- src/Language/Cimple/Parser.y | 39 ++++++++++++++++++---------------- tools/BUILD.bazel | 14 +++++++++++- tools/count-tokens.hs | 41 ++++++++++++++++++++++++++++++++++++ tools/dump-tokens.hs | 20 ++++++++---------- 8 files changed, 116 insertions(+), 60 deletions(-) create mode 100644 tools/count-tokens.hs diff --git a/BUILD.bazel b/BUILD.bazel index 51dee09..79476a0 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -25,6 +25,8 @@ haskell_library( hazel_library("aeson"), hazel_library("array"), hazel_library("base"), + hazel_library("bytestring"), + hazel_library("text"), ], ) @@ -77,6 +79,7 @@ haskell_library( hazel_library("base"), hazel_library("data-fix"), hazel_library("recursion-schemes"), + hazel_library("text"), hazel_library("transformers-compat"), ], ) @@ -139,7 +142,7 @@ haskell_library( ], ), src_strip_prefix = "src", - version = "0.0.16", + version = "0.0.17", visibility = ["//visibility:public"], deps = [ ":ast", diff --git a/cimple.cabal b/cimple.cabal index 390476d..f191ba3 100644 --- a/cimple.cabal +++ b/cimple.cabal @@ -1,5 +1,5 @@ name: cimple -version: 0.0.16 +version: 0.0.17 synopsis: Simple C-like programming language homepage: https://toktok.github.io/ license: GPL-3 diff --git a/src/Language/Cimple/IO.hs b/src/Language/Cimple/IO.hs index daea8f7..07fc664 100644 --- a/src/Language/Cimple/IO.hs +++ b/src/Language/Cimple/IO.hs @@ -10,11 
+10,10 @@ module Language.Cimple.IO import Control.Monad ((>=>)) import qualified Control.Monad.Parallel as P import Control.Monad.State.Strict (State, evalState, get, put) -import qualified Data.ByteString as BS +import qualified Data.ByteString.Lazy as LBS import Data.Map.Strict (Map) import qualified Data.Map.Strict as Map import Data.Text (Text) -import qualified Data.Text as Text import qualified Data.Text.Encoding as Text import Language.Cimple.Ast (Node) import Language.Cimple.Lexer (Lexeme, runAlex) @@ -27,37 +26,36 @@ import qualified Language.Cimple.Program as Program import Language.Cimple.TranslationUnit (TranslationUnit) import qualified Language.Cimple.TreeParser as TreeParser -type StringNode = Node (Lexeme String) type TextNode = Node (Lexeme Text) -toTextAst :: [StringNode] -> [TextNode] -toTextAst stringAst = - evalState (mapAst cacheActions stringAst) Map.empty +cacheText :: [TextNode] -> [TextNode] +cacheText textAst = + evalState (mapAst cacheActions textAst) Map.empty where - cacheActions :: TextActions (State (Map String Text)) String Text + cacheActions :: TextActions (State (Map Text Text)) Text Text cacheActions = textActions $ \s -> do m <- get case Map.lookup s m of Nothing -> do - let text = Text.pack s - put $ Map.insert s text m - return text + put $ Map.insert s s m + return s Just text -> return text parseText :: Text -> Either String [TextNode] -parseText contents = - toTextAst <$> runAlex (Text.unpack contents) Parser.parseTranslationUnit +parseText = fmap cacheText . flip runAlex Parser.parseTranslationUnit . LBS.fromStrict . Text.encodeUtf8 + +parseBytes :: LBS.ByteString -> Either String [TextNode] +parseBytes = flip runAlex Parser.parseTranslationUnit -parseTextPedantic :: Text -> Either String [TextNode] -parseTextPedantic = - parseText >=> ParseResult.toEither . TreeParser.parseTranslationUnit +parseBytesPedantic :: LBS.ByteString -> Either String [TextNode] +parseBytesPedantic = parseBytes >=> ParseResult.toEither . 
TreeParser.parseTranslationUnit parseFile :: FilePath -> IO (Either String (TranslationUnit Text)) parseFile source = - addSource . parseTextPedantic . Text.decodeUtf8 <$> BS.readFile source + addSource . parseBytesPedantic <$> LBS.readFile source where -- Add source filename to the error message, if any. addSource (Left err) = Left $ source <> err diff --git a/src/Language/Cimple/Lexer.x b/src/Language/Cimple/Lexer.x index 825a149..f496907 100644 --- a/src/Language/Cimple/Lexer.x +++ b/src/Language/Cimple/Lexer.x @@ -4,6 +4,7 @@ {-# LANGUAGE DeriveTraversable #-} {-# LANGUAGE StandaloneDeriving #-} {-# LANGUAGE StrictData #-} +{-# OPTIONS_GHC -Wno-unused-imports #-} module Language.Cimple.Lexer ( Alex , AlexPosn (..) @@ -15,16 +16,20 @@ module Language.Cimple.Lexer , lexemePosn , lexemeText , lexemeLine - , mkL , runAlex ) where import Data.Aeson (FromJSON, ToJSON) +import qualified Data.ByteString as BS +import qualified Data.ByteString.Lazy as LBS +import Data.Text (Text) +import qualified Data.Text as Text +import qualified Data.Text.Encoding as Text import GHC.Generics (Generic) import Language.Cimple.Tokens (LexemeClass (..)) } -%wrapper "monad" +%wrapper "monad-bytestring" tokens :- @@ -302,8 +307,9 @@ data Lexeme text = L AlexPosn LexemeClass text instance FromJSON text => FromJSON (Lexeme text) instance ToJSON text => ToJSON (Lexeme text) -mkL :: Applicative m => LexemeClass -> AlexInput -> Int -> m (Lexeme String) -mkL c (p, _, _, str) len = pure $ L p c (take len str) +mkL :: LexemeClass -> AlexInput -> Int64 -> Alex (Lexeme Text) +mkL c (p, _, str, _) len = pure $ L p c (piece str) + where piece = Text.decodeUtf8 . LBS.toStrict . 
LBS.take len lexemePosn :: Lexeme text -> AlexPosn lexemePosn (L p _ _) = p @@ -317,15 +323,10 @@ lexemeText (L _ _ s) = s lexemeLine :: Lexeme text -> Int lexemeLine (L (AlexPn _ l _) _ _) = l -start :: Int -> AlexInput -> Int -> Alex (Lexeme String) -start code _ _ = do - alexSetStartCode code - alexMonadScan +alexEOF :: Alex (Lexeme Text) +alexEOF = return (L (AlexPn 0 0 0) Eof Text.empty) -alexEOF :: Alex (Lexeme String) -alexEOF = return (L (AlexPn 0 0 0) Eof "") - -alexScanTokens :: String -> Either String [Lexeme String] +alexScanTokens :: LBS.ByteString -> Either String [Lexeme Text] alexScanTokens str = runAlex str $ loop [] where diff --git a/src/Language/Cimple/Parser.y b/src/Language/Cimple/Parser.y index 73e1b5c..d8f2769 100644 --- a/src/Language/Cimple/Parser.y +++ b/src/Language/Cimple/Parser.y @@ -1,9 +1,12 @@ { +{-# LANGUAGE OverloadedStrings #-} module Language.Cimple.Parser ( parseTranslationUnit ) where import Data.Fix (Fix (..)) +import Data.Text (Text) +import qualified Data.Text as Text import Language.Cimple.Ast (AssignOp (..), BinaryOp (..), CommentStyle (..), LiteralType (..), Node, NodeF (..), Scope (..), @@ -22,7 +25,7 @@ import Language.Cimple.Tokens (LexemeClass (..)) %errorhandlertype explist %lexer {lexwrap} {L _ Eof _} %monad {Alex} -%tokentype {StringLexeme} +%tokentype {Term} %token ID_CONST { L _ IdConst _ } ID_FUNC_TYPE { L _ IdFuncType _ } @@ -173,12 +176,12 @@ CopyrightDecl :: { NonTerm } CopyrightDecl : ' ' 'Copyright' CopyrightDates CopyrightOwner '\n' { Fix $ CopyrightDecl (fst $3) (snd $3) $4 } -CopyrightDates :: { (StringLexeme, Maybe StringLexeme) } +CopyrightDates :: { (Term, Maybe Term) } CopyrightDates : LIT_INTEGER { ($1, Nothing) } | LIT_INTEGER '-' LIT_INTEGER { ($1, Just $3) } -CopyrightOwner :: { [StringLexeme] } +CopyrightOwner :: { [Term] } CopyrightOwner : CMT_WORD CommentWords { $1 : reverse $2 } @@ -216,24 +219,24 @@ Comment | '/** @} */' { Fix $ CommentSectionEnd $1 } | Ignore { $1 } -CommentTokens :: { 
[StringLexeme] } +CommentTokens :: { [Term] } CommentTokens : CommentToken { [$1] } | CommentTokens CommentToken { $2 : $1 } -CommentToken :: { StringLexeme } +CommentToken :: { Term } CommentToken : CommentWord { $1 } | '\n' { $1 } | ' * ' { $1 } | ' ' { $1 } -CommentWords :: { [StringLexeme] } +CommentWords :: { [Term] } CommentWords : { [] } | CommentWords CommentWord { $2 : $1 } -CommentWord :: { StringLexeme } +CommentWord :: { Term } CommentWord : CMT_WORD { $1 } | CMT_COMMAND { $1 } @@ -265,7 +268,7 @@ Ignore :: { NonTerm } Ignore : IGN_START IgnoreBody IGN_END { Fix $ Comment Ignore $1 (reverse $2) $3 } -IgnoreBody :: { [StringLexeme] } +IgnoreBody :: { [Term] } IgnoreBody : IGN_BODY { [$1] } | IgnoreBody IGN_BODY { $2 : $1 } @@ -602,7 +605,7 @@ Enumerator | EnumeratorName '=' ConstExpr ',' { Fix $ Enumerator $1 (Just $3) } | Comment { $1 } -EnumeratorName :: { StringLexeme } +EnumeratorName :: { Term } EnumeratorName : ID_CONST { $1 } | ID_SUE_TYPE { $1 } @@ -683,11 +686,11 @@ NonNull | nullable '(' Ints ')' { Fix . NonNull [] $3 } | non_null '(' Ints ')' nullable '(' Ints ')' { Fix . NonNull $3 $7 } -Ints :: { [StringLexeme] } +Ints :: { [Term] } Ints : IntList { reverse $1 } -IntList :: { [StringLexeme] } +IntList :: { [Term] } IntList : LIT_INTEGER { [$1] } | IntList ',' LIT_INTEGER { $3 : $1 } @@ -729,8 +732,8 @@ ConstDecl | static const LeafType ID_VAR '=' InitialiserExpr ';' { Fix $ ConstDefn Static $3 $4 $6 } { -type StringLexeme = Lexeme String -type NonTerm = Node StringLexeme +type Term = Lexeme Text +type NonTerm = Node Term tyPointer, tyConst :: NonTerm -> NonTerm tyPointer = Fix . 
TyPointer @@ -741,14 +744,14 @@ parseError (L (AlexPn _ line col) c t, options) = alexError $ ":" <> show line <> ":" <> show col <> ": Parse error near " <> show c <> ": " <> show t <> "; expected one of " <> show options -lexwrap :: (Lexeme String -> Alex a) -> Alex a +lexwrap :: (Lexeme Text -> Alex a) -> Alex a lexwrap = (alexMonadScan >>=) externC - :: Lexeme String - -> Lexeme String + :: Term + -> Term -> [NonTerm] - -> Lexeme String + -> Term -> Alex NonTerm externC (L _ _ "__cplusplus") (L _ _ "\"C\"") decls (L _ _ "__cplusplus") = return $ Fix $ ExternC decls @@ -758,7 +761,7 @@ externC _ lang _ _ = macroBodyStmt :: NonTerm - -> Lexeme String + -> Term -> Alex NonTerm macroBodyStmt decls (L _ _ "0") = return $ Fix $ MacroBodyStmt decls diff --git a/tools/BUILD.bazel b/tools/BUILD.bazel index 138a919..d17e1b1 100644 --- a/tools/BUILD.bazel +++ b/tools/BUILD.bazel @@ -25,6 +25,19 @@ haskell_binary( ], ) +haskell_binary( + name = "count-tokens", + srcs = ["count-tokens.hs"], + visibility = ["//visibility:public"], + deps = [ + "//hs-cimple", + hazel_library("base"), + hazel_library("bytestring"), + hazel_library("text"), + hazel_library("time"), + ], +) + haskell_binary( name = "dump-tokens", srcs = ["dump-tokens.hs"], @@ -34,7 +47,6 @@ haskell_binary( hazel_library("base"), hazel_library("bytestring"), hazel_library("groom"), - hazel_library("text"), ], ) diff --git a/tools/count-tokens.hs b/tools/count-tokens.hs new file mode 100644 index 0000000..9f3f3d4 --- /dev/null +++ b/tools/count-tokens.hs @@ -0,0 +1,41 @@ +{-# LANGUAGE Strict #-} +module Main (main) where + +import qualified Data.ByteString.Lazy as LBS +import qualified Data.Text as Text +import Data.Time.Clock (diffUTCTime, getCurrentTime) +import Language.Cimple (Alex, Lexeme (..), LexemeClass (..), + alexMonadScan, runAlex) +import System.Environment (getArgs) + + +countTokens :: LBS.ByteString -> Either String (Int, Int) +countTokens str = runAlex str $ loop 0 0 + where + loop :: Int -> Int -> 
Alex (Int, Int) + loop toks len = do + (L _ c t) <- alexMonadScan + if c == Eof + then return (toks, len) + else loop (toks + 1) (len + Text.length t) + +processFile :: FilePath -> IO (Int, Int) +processFile source = do + contents <- LBS.readFile source + case countTokens contents of + Left err -> fail err + Right ok -> return ok + +processFiles :: [FilePath] -> IO (Int, Int) +processFiles = fmap ((\(a, b) -> (sum a, sum b)) . unzip) . mapM processFile + +main :: IO () +main = do + sources <- getArgs + start <- getCurrentTime + (toks, len) <- processFiles sources + end <- getCurrentTime + putStrLn $ "Tokenised " <> show (length sources) <> " sources into " + <> show toks <> " lexemes (" <> show len <> " bytes) in " + <> show (diffUTCTime end start) <> " (" + <> show (fromIntegral len / 1024 / 1024 / diffUTCTime end start) <> " MiB/s)" diff --git a/tools/dump-tokens.hs b/tools/dump-tokens.hs index cd90a92..5fc829a 100644 --- a/tools/dump-tokens.hs +++ b/tools/dump-tokens.hs @@ -1,23 +1,21 @@ module Main (main) where -import qualified Data.ByteString as BS -import qualified Data.Text as Text -import qualified Data.Text.Encoding as Text -import Language.Cimple (alexScanTokens) -import System.Environment (getArgs) -import Text.Groom (groom) +import qualified Data.ByteString.Lazy as LBS +import Language.Cimple (alexScanTokens) +import System.Environment (getArgs) +import Text.Groom (groom) processFile :: FilePath -> IO () processFile source = do putStrLn $ "Processing " ++ source - contents <- Text.unpack . Text.decodeUtf8 <$> BS.readFile source + contents <- LBS.readFile source case alexScanTokens contents of Left err -> fail err Right ok -> putStrLn $ groom ok main :: IO () main = do - args <- getArgs - case args of - [src] -> processFile src - _ -> fail "Usage: dump-tokens <file.c>" + args <- getArgs + case args of + [src] -> processFile src + _ -> fail "Usage: dump-tokens <file.c>"