Skip to content

Commit

Permalink
Merge pull request #50 from ajinabraham/mobsf_queue
Browse files Browse the repository at this point in the history
Split file read + regex scan
  • Loading branch information
ajinabraham authored Nov 13, 2024
2 parents dddf52a + 96ec743 commit ae43889
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 32 deletions.
2 changes: 1 addition & 1 deletion libsast/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
__title__ = 'libsast'
__authors__ = 'Ajin Abraham'
__copyright__ = f'Copyright {year} Ajin Abraham, opensecurity.in'
__version__ = '3.1.0'
__version__ = '3.1.1'
__version_info__ = tuple(int(i) for i in __version__.split('.'))
__all__ = [
'Scanner',
Expand Down
38 changes: 23 additions & 15 deletions libsast/core_matcher/choice_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,40 +31,48 @@ def __init__(self, options: dict) -> None:

def scan(self, paths: list) -> dict:
"""Scan file(s) or directory per rule."""
# Bail out early when there are no rules or no paths. The bare return
# yields None here, even though the signature is annotated as dict.
if not (self.scan_rules and paths):
return
self.validate_rules()

# When progress display is enabled, replace the rule collection with
# whatever progress_loop returns; the bar is sized by the rule count.
# NOTE(review): presumably progress_loop yields the same rules while
# reporting progress -- confirm against common.ProgressBar.
if self.show_progress:
pbar = common.ProgressBar('Choice Match', len(self.scan_rules))
self.scan_rules = pbar.progress_loop(self.scan_rules)

# Two phases: load the file contents first, then run the regex pass
# over them and return that pass's result.
file_contents = self.read_file_contents(paths)
return self.regex_scan(file_contents)

def read_file_contents(self, paths: list) -> list:
"""Load file(s) content."""
if not (self.scan_rules and paths):
return
self.validate_rules()
choice_args = []
for rule in self.scan_rules:
scan_paths = paths
if rule['type'] != 'code' and self.alternative_path:
# Scan only alternative path
scan_paths = [Path(self.alternative_path)]
choice_args.append((scan_paths, rule))
if not choice_args:
return []

# Use ThreadPoolExecutor for reading file contents and
# ProcessPoolExecutor for processing regex
with ThreadPoolExecutor() as io_executor, ProcessPoolExecutor(
max_workers=self.cpu) as cpu_executor:
# Use ThreadPoolExecutor for file reading
with ThreadPoolExecutor() as io_executor:
# Submit file reading tasks and wait for results
futures = []
for args_tuple in choice_args:
# Submit each read task and store the future along with the args
future = io_executor.submit(
self._read_file_contents, args_tuple)
futures.append((future, args_tuple))
futures.append(future)
return [future.result() for future in futures]

def regex_scan(self, file_contents) -> list:
"""Process regex matches on the file contents."""
# Use ProcessPoolExecutor for regex processing
with ProcessPoolExecutor(max_workers=self.cpu) as cpu_executor:

results = []
for future, _ in futures:
file_contents = future.result()
# This will block until the file reading is done
# Process the file contents with ProcessPoolExecutor
for content in file_contents:
# Process Choice Matcher on the file contents
process_future = cpu_executor.submit(
self.choice_matcher, file_contents)
self.choice_matcher, content)
results.append(process_future.result())

self.add_finding(results)
Expand Down
26 changes: 19 additions & 7 deletions libsast/core_matcher/pattern_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,27 +31,39 @@ def __init__(self, options: dict) -> None:

def scan(self, paths: list) -> dict:
"""Scan file(s) or directory."""
# Bail out early when there are no rules or no paths. The bare return
# yields None here, even though the signature is annotated as dict.
if not (self.scan_rules and paths):
return
self.validate_rules()

# When progress display is enabled, replace the path collection with
# whatever progress_loop returns; the bar is sized by the path count.
# NOTE(review): presumably progress_loop yields the same paths while
# reporting progress -- confirm against common.ProgressBar.
if self.show_progress:
pbar = common.ProgressBar('Pattern Match', len(paths))
paths = pbar.progress_loop(paths)

# Two phases: load the file contents first, then run the regex pass
# over them and return that pass's result.
file_contents = self.read_file_contents(paths)
return self.regex_scan(file_contents)

def read_file_contents(self, paths: list) -> list:
"""Load file(s) content."""
if not (self.scan_rules and paths):
return
self.validate_rules()

# Filter files by extension and size, prepare list for processing
files_to_scan = {
sfile for sfile in paths
if is_file_valid(sfile, self.exts, 5)
}
if not files_to_scan:
return []

# Use a ThreadPool for file reading, and ProcessPool for CPU-bound regex
with ThreadPoolExecutor() as io_executor, ProcessPoolExecutor(
max_workers=self.cpu) as cpu_executor:
# Use a ThreadPool for file reading
with ThreadPoolExecutor() as io_executor:

# Read all files
file_contents = list(io_executor.map(
self._read_file_content, files_to_scan))
return file_contents

def regex_scan(self, file_contents: list) -> dict:
"""Scan file(s) content."""
# Use a ProcessPool for CPU-bound regex
with ProcessPoolExecutor(max_workers=self.cpu) as cpu_executor:

# Run regex on file data
results = cpu_executor.map(
Expand Down
16 changes: 8 additions & 8 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "libsast"
version = "3.1.0"
version = "3.1.1"
description = "A generic SAST library built on top of semgrep and regex"
keywords = ["libsast", "SAST", "Python SAST", "SAST API", "Regex SAST", "Pattern Matcher"]
authors = ["Ajin Abraham <ajin@opensecurity.in>"]
Expand Down

0 comments on commit ae43889

Please sign in to comment.