Skip to content

Commit

Permalink
Merge branch 'development' of https://github.com/usc-isi-i2/etk into …
Browse files Browse the repository at this point in the history
…development
  • Loading branch information
saggu committed Oct 22, 2018
2 parents 2fce38b + e7dabb1 commit 005caed
Show file tree
Hide file tree
Showing 16 changed files with 438 additions and 33 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ deactivate

For example:

`python -m etk dummy --test "this is a test"`
`echo "abcd" | python -m etk regex_extractor "a.*c"`

## Docker

Expand Down
14 changes: 14 additions & 0 deletions docs/_sources/installation.rst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,20 @@ Run all ETK unit test::

python -m unittest discover

Run ETK CLI
-----------
ETK needs to be installed as a Python package::

python -m etk <command> [options]

For example::

echo "abcd" | python -m etk regex_extractor "a.*c"

Chained extractors::

cat ./sample_input/test.html | python -m etk html_content_extractor | python -m etk date_extractor


Build documentation
-------------------
Expand Down
33 changes: 33 additions & 0 deletions etk/cli/bitcoin_address_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import warnings
import sys
import argparse

from etk.extractors.bitcoin_address_extractor import BitcoinAddressExtractor

# Extractor instance created once at import time; run() below calls its extract() on every input line.
bitcoin_address_extractor = BitcoinAddressExtractor()


def add_arguments(parser):
    """Configure the sub-command parser: usage examples plus the input argument.

    Args:
        parser (argparse.ArgumentParser): parser that is configured in place.
    """
    usage_examples = (
        'Examples:\n'
        'python -m etk bitcoin_address_extractor /tmp/input.txt\n'
        'cat /tmp/input.txt | python -m etk bitcoin_address_extractor'
    )
    parser.description = usage_examples
    # Optional positional opened for reading; defaults to stdin when omitted,
    # which is what allows the piped form shown in the examples.
    parser.add_argument(
        'input_file',
        nargs='?',
        type=argparse.FileType('r'),
        default=sys.stdin,
    )


def run(args):
    """Feed each input line to the bitcoin address extractor and print the results.

    Every extraction returned for a line has its ``value`` printed on its own
    line. All warnings are suppressed while the input is processed.

    Args:
        args (argparse.Namespace): parsed arguments; ``args.input_file`` is
            iterated line by line.
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        for text_line in args.input_file:
            for extraction in bitcoin_address_extractor.extract(text_line):
                print(extraction.value)
33 changes: 33 additions & 0 deletions etk/cli/cryptographic_hash_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import warnings
import sys
import argparse

from etk.extractors.cryptographic_hash_extractor import CryptographicHashExtractor

# Extractor instance created once at import time; run() below calls its extract() on every input line.
cryptographic_hash_extractor = CryptographicHashExtractor()


def add_arguments(parser):
    """Configure the sub-command parser: usage examples plus the input argument.

    Args:
        parser (argparse.ArgumentParser): parser that is configured in place.
    """
    usage_examples = (
        'Examples:\n'
        'python -m etk cryptographic_hash_extractor /tmp/input.txt\n'
        'cat /tmp/input.txt | python -m etk cryptographic_hash_extractor'
    )
    parser.description = usage_examples
    # Optional positional opened for reading; defaults to stdin when omitted,
    # which is what allows the piped form shown in the examples.
    parser.add_argument(
        'input_file',
        nargs='?',
        type=argparse.FileType('r'),
        default=sys.stdin,
    )


def run(args):
    """Feed each input line to the cryptographic hash extractor and print the results.

    Every extraction returned for a line has its ``value`` printed on its own
    line. All warnings are suppressed while the input is processed.

    Args:
        args (argparse.Namespace): parsed arguments; ``args.input_file`` is
            iterated line by line.
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        for text_line in args.input_file:
            for extraction in cryptographic_hash_extractor.extract(text_line):
                print(extraction.value)
33 changes: 33 additions & 0 deletions etk/cli/cve_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import warnings
import sys
import argparse

from etk.extractors.cve_extractor import CVEExtractor

# Extractor instance created once at import time; run() below calls its extract() on every input line.
cve_extractor = CVEExtractor()


def add_arguments(parser):
    """Configure the sub-command parser for the CVE extractor CLI.

    Sets the help description (usage examples) and registers the single
    optional positional argument.

    Args:
        parser (argparse.ArgumentParser): parser that is configured in place.
    """
    # The examples must name this command; they previously showed
    # bitcoin_address_extractor, left over from the module this was copied from.
    parser.description = 'Examples:\n' \
                         'python -m etk cve_extractor /tmp/input.txt\n' \
                         'cat /tmp/input.txt | python -m etk cve_extractor'
    # Optional positional opened for reading; defaults to stdin when omitted,
    # which is what allows the piped form shown in the examples.
    parser.add_argument('input_file', nargs='?', type=argparse.FileType('r'), default=sys.stdin)


def run(args):
    """Feed each input line to the CVE extractor and print the results.

    Every extraction returned for a line has its ``value`` printed on its own
    line. All warnings are suppressed while the input is processed.

    Args:
        args (argparse.Namespace): parsed arguments; ``args.input_file`` is
            iterated line by line.
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        for text_line in args.input_file:
            for extraction in cve_extractor.extract(text_line):
                print(extraction.value)
33 changes: 33 additions & 0 deletions etk/cli/email_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import warnings
import sys
import argparse

from etk.extractors.email_extractor import EmailExtractor

# Extractor instance created once at import time; run() below calls its extract() on every input line.
email_extractor = EmailExtractor()


def add_arguments(parser):
    """Configure the sub-command parser: usage examples plus the input argument.

    Args:
        parser (argparse.ArgumentParser): parser that is configured in place.
    """
    usage_examples = (
        'Examples:\n'
        'python -m etk email_extractor /tmp/input.txt\n'
        'cat /tmp/input.txt | python -m etk email_extractor'
    )
    parser.description = usage_examples
    # Optional positional opened for reading; defaults to stdin when omitted,
    # which is what allows the piped form shown in the examples.
    parser.add_argument(
        'input_file',
        nargs='?',
        type=argparse.FileType('r'),
        default=sys.stdin,
    )


def run(args):
    """Feed each input line to the email extractor and print the results.

    Every extraction returned for a line has its ``value`` printed on its own
    line. All warnings are suppressed while the input is processed.

    Args:
        args (argparse.Namespace): parsed arguments; ``args.input_file`` is
            iterated line by line.
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        for text_line in args.input_file:
            for extraction in email_extractor.extract(text_line):
                print(extraction.value)
33 changes: 33 additions & 0 deletions etk/cli/hostname_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import warnings
import sys
import argparse

from etk.extractors.hostname_extractor import HostnameExtractor

# Extractor instance created once at import time; run() below calls its extract() on every input line.
hostname_extractor = HostnameExtractor()


def add_arguments(parser):
    """Configure the sub-command parser: usage examples plus the input argument.

    Args:
        parser (argparse.ArgumentParser): parser that is configured in place.
    """
    usage_examples = (
        'Examples:\n'
        'python -m etk hostname_extractor /tmp/input.txt\n'
        'cat /tmp/input.txt | python -m etk hostname_extractor'
    )
    parser.description = usage_examples
    # Optional positional opened for reading; defaults to stdin when omitted,
    # which is what allows the piped form shown in the examples.
    parser.add_argument(
        'input_file',
        nargs='?',
        type=argparse.FileType('r'),
        default=sys.stdin,
    )


def run(args):
    """Feed each input line to the hostname extractor and print the results.

    Every extraction returned for a line has its ``value`` printed on its own
    line. All warnings are suppressed while the input is processed.

    Args:
        args (argparse.Namespace): parsed arguments; ``args.input_file`` is
            iterated line by line.
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        for text_line in args.input_file:
            for extraction in hostname_extractor.extract(text_line):
                print(extraction.value)
32 changes: 32 additions & 0 deletions etk/cli/html_content_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import warnings
import sys
import argparse

from etk.extractors.html_content_extractor import HTMLContentExtractor


def add_arguments(parser):
    """Configure the sub-command parser: usage examples plus the input argument.

    Args:
        parser (argparse.ArgumentParser): parser that is configured in place.
    """
    usage_examples = (
        'Examples:\n'
        'python -m etk html_content_extractor /tmp/input.html\n'
        'cat /tmp/input.html | python -m etk html_content_extractor'
    )
    parser.description = usage_examples
    # Optional positional opened for reading; defaults to stdin when omitted,
    # which is what allows the piped form shown in the examples.
    parser.add_argument(
        'input_file',
        nargs='?',
        type=argparse.FileType('r'),
        default=sys.stdin,
    )


def run(args):
    """Run an HTMLContentExtractor over the input and print each extraction's value.

    A fresh extractor is built per call and the whole input is handed to it in
    a single ``extract`` call. All warnings are suppressed during extraction.

    Args:
        args (argparse.Namespace): parsed arguments; ``args.input_file`` is
            passed to the extractor as ``html_text``.
    """
    extractor = HTMLContentExtractor()

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # NOTE(review): the open file object itself (not its contents) is
        # passed as html_text -- confirm the extractor accepts a file handle.
        for extraction in extractor.extract(html_text=args.input_file):
            print(extraction.value)
33 changes: 33 additions & 0 deletions etk/cli/html_metadate_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import warnings
import sys
import argparse

from etk.extractors.html_metadata_extractor import HTMLMetadataExtractor


def add_arguments(parser):
    """Configure the sub-command parser for the HTML metadata extractor CLI.

    Sets the help description (usage examples) and registers the single
    optional positional argument ``input_file``.

    Args:
        parser (argparse.ArgumentParser): parser that is configured in place.
    """
    # The description and a stray 'pattern' positional were copied from the
    # regex CLI. run() never reads args.pattern, and because 'pattern' came
    # first it swallowed the file path, leaving input_file at its stdin default.
    # NOTE(review): the command name below follows the extractor's name; the
    # module file is spelled "html_metadate_extractor" -- confirm which name the
    # CLI dispatcher actually registers.
    parser.description = 'Examples:\n' \
                         'python -m etk html_metadata_extractor /tmp/input.html\n' \
                         'cat /tmp/input.html | python -m etk html_metadata_extractor'
    # Optional positional opened for reading; defaults to stdin when omitted,
    # which is what allows the piped form shown in the examples.
    parser.add_argument('input_file', nargs='?', type=argparse.FileType('r'), default=sys.stdin)


def run(args):
    """Run an HTMLMetadataExtractor over the input and print each extraction's value.

    A fresh extractor is built per call and the whole input is handed to it in
    a single ``extract`` call. All warnings are suppressed during extraction.

    Args:
        args (argparse.Namespace): parsed arguments; ``args.input_file`` is
            passed to the extractor as ``html_text``.
    """
    extractor = HTMLMetadataExtractor()

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # NOTE(review): the open file object itself (not its contents) is
        # passed as html_text -- confirm the extractor accepts a file handle.
        for extraction in extractor.extract(html_text=args.input_file):
            print(extraction.value)
33 changes: 33 additions & 0 deletions etk/cli/ip_address_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import warnings
import sys
import argparse

from etk.extractors.ip_address_extractor import IPAddressExtractor

# Extractor instance created once at import time; run() below calls its extract() on every input line.
ip_address_extractor = IPAddressExtractor()


def add_arguments(parser):
    """Configure the sub-command parser: usage examples plus the input argument.

    Args:
        parser (argparse.ArgumentParser): parser that is configured in place.
    """
    usage_examples = (
        'Examples:\n'
        'python -m etk ip_address_extractor /tmp/input.txt\n'
        'cat /tmp/input.txt | python -m etk ip_address_extractor'
    )
    parser.description = usage_examples
    # Optional positional opened for reading; defaults to stdin when omitted,
    # which is what allows the piped form shown in the examples.
    parser.add_argument(
        'input_file',
        nargs='?',
        type=argparse.FileType('r'),
        default=sys.stdin,
    )


def run(args):
    """Feed each input line to the IP address extractor and print the results.

    Every extraction returned for a line has its ``value`` printed on its own
    line. All warnings are suppressed while the input is processed.

    Args:
        args (argparse.Namespace): parsed arguments; ``args.input_file`` is
            iterated line by line.
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        for text_line in args.input_file:
            for extraction in ip_address_extractor.extract(text_line):
                print(extraction.value)
34 changes: 34 additions & 0 deletions etk/cli/regex_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import warnings
import sys
import argparse

from etk.extractors.regex_extractor import RegexExtractor


def add_arguments(parser):
    """Configure the sub-command parser for the regex extractor CLI.

    Sets the help description (usage examples) and registers two positionals:
    the required ``pattern`` and the optional ``input_file``.

    Args:
        parser (argparse.ArgumentParser): parser that is configured in place.
    """
    parser.description = 'Examples:\n' \
                         'python -m etk regex_extractor pattern /tmp/date.txt\n' \
                         'cat /tmp/date.txt | python -m etk regex_extractor pattern'
    # The pattern is mandatory. It used to be optional with default=sys.stdin,
    # so omitting it handed a file object to RegexExtractor(pattern=...)
    # instead of producing a usage error.
    parser.add_argument('pattern', type=str)
    # Optional positional opened for reading; defaults to stdin when omitted,
    # which is what allows the piped form shown in the examples.
    parser.add_argument('input_file', nargs='?', type=argparse.FileType('r'), default=sys.stdin)


def run(args):
    """Apply a RegexExtractor built from ``args.pattern`` to each input line.

    Every extraction returned for a line has its ``value`` printed on its own
    line. All warnings are suppressed while the input is processed.

    Args:
        args (argparse.Namespace): parsed arguments; ``args.pattern`` is given
            to the extractor and ``args.input_file`` is iterated line by line.
    """
    extractor = RegexExtractor(pattern=args.pattern)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        for text_line in args.input_file:
            for extraction in extractor.extract(text_line):
                print(extraction.value)
32 changes: 32 additions & 0 deletions etk/cli/sample_input/test.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
<html>
<head>
<title>
15 Sep 2018: The Dormouse's story
</title>
</head>
<body>
<p class="title">
<b>
15 Sep 2018: The Dormouse's story
</b>
</p>
<p class="story">
10 days ago, there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
Elsie
</a>
,
<a class="sister" href="http://example.com/lacie" id="link2">
Lacie
</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">
Tillie
</a>
; and they lived at the bottom of a well.
</p>
<p class="story">
...
</p>
</body>
</html>
Loading

0 comments on commit 005caed

Please sign in to comment.