Rm unused code, fixed requirements
sveinbjornt committed Oct 17, 2023
1 parent 9c7d637 commit 9f1ab73
Showing 4 changed files with 5 additions and 123 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -116,6 +116,7 @@ ENV/
 env.bak/
 venv.bak/
 p36*
+venv

 # Spyder project settings
 .spyderproject
118 changes: 0 additions & 118 deletions db/models.py
@@ -74,124 +74,6 @@ def __eq__(self, other: object) -> bool:  # type: ignore
 setattr(Base, "table", classmethod(lambda cls: cls.__table__))


-class Root(Base):
-    """Represents a scraper root, i.e. a base domain and root URL"""
-
-    __tablename__ = "roots"
-
-    # Primary key
-    id = Column(Integer, Sequence("roots_id_seq"), primary_key=True)
-
-    # Domain suffix, root URL, human-readable description
-    domain = Column(String, nullable=False)
-    url = Column(String, nullable=False)
-    description = Column(String)
-
-    # Default author
-    author = Column(String)
-    # Default authority of this source, 1.0 = most authoritative, 0.0 = least authoritative
-    authority = Column(Float)
-    # Finish time of last scrape of this root
-    scraped = Column(DateTime, index=True)
-    # Module to use for scraping
-    scr_module = Column(String(80))
-    # Class within module to use for scraping
-    scr_class = Column(String(80))
-    # Are articles of this root visible on the Greynir web?
-    visible = Column(Boolean, default=True)
-    # Should articles of this root be scraped automatically?
-    scrape = Column(Boolean, default=True)
-
-    # The combination of domain + url must be unique
-    __table_args__ = (UniqueConstraint("domain", "url"),)
-
-    def __repr__(self):
-        return "Root(domain='{0}', url='{1}', description='{2}')".format(
-            self.domain, self.url, self.description
-        )
-
-
-class Article(Base):
-    """Represents an article from one of the roots, to be scraped or having already been scraped"""
-
-    __tablename__ = "articles"
-
-    # The article URL is the primary key
-    url = Column(String, primary_key=True)
-
-    # UUID
-    id = Column(
-        psql_UUID(as_uuid=False),
-        index=True,
-        nullable=False,
-        unique=True,
-        server_default=text("uuid_generate_v1()"),
-    )
-
-    # Foreign key to a root
-    root_id = cast(
-        Optional[int],
-        Column(
-            Integer,
-            # We don't delete associated articles if the root is deleted
-            ForeignKey("roots.id", onupdate="CASCADE", ondelete="SET NULL"),
-        ),
-    )
-
-    # Article heading, if known
-    heading = Column(String)
-    # Article author, if known
-    author = Column(String)
-    # Article time stamp, if known
-    timestamp = Column(DateTime, index=True)
-
-    # Authority of this article, 1.0 = most authoritative, 0.0 = least authoritative
-    authority = Column(Float)
-    # Time of the last scrape of this article
-    scraped = Column(DateTime, index=True)
-    # Time of the last parse of this article
-    parsed = Column(DateTime, index=True)
-    # Time of the last processing of this article
-    processed = Column(DateTime, index=True)
-    # Time of the last indexing of this article
-    indexed = Column(DateTime, index=True)
-    # Module used for scraping
-    scr_module = Column(String(80))
-    # Class within module used for scraping
-    scr_class = Column(String(80))
-    # Version of scraper class
-    scr_version = Column(String(16))
-    # Version of parser/grammar/config
-    parser_version = Column(String(32))
-    # Parse statistics
-    num_sentences = Column(Integer)
-    num_parsed = Column(Integer)
-    ambiguity = Column(Float)
-
-    # The HTML obtained in the last scrape
-    html = Column(String)
-    # The parse tree obtained in the last parse
-    tree = Column(String)
-    # The tokens of the article in JSON string format
-    tokens = Column(String)
-    # The article topic vector as an array of floats in JSON string format
-    topic_vector = Column(String)
-
-    # The back-reference to the Root parent of this Article
-    # Modify this to RelationshipProperty[Root] once Pylance, Mypy and Python 3.6
-    # settle their differences
-    root: RelationshipProperty = relationship(  # type: ignore
-        "Root",
-        foreign_keys="Article.root_id",
-        backref=backref("articles", order_by=url),  # type: ignore
-    )
-
-    def __repr__(self):
-        return "Article(url='{0}', heading='{1}', scraped={2})".format(
-            self.url, self.heading, self.scraped
-        )
-
-

 class Entity(Base):
     """Represents a named entity"""

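The two classes deleted above are standard SQLAlchemy 1.4 declarative models: Article.root_id is a foreign key to roots.id, and the backref declared on Article.root exposes the reverse collection root.articles. For context, a minimal sketch of how such models are typically queried; the connection URL and session setup are illustrative assumptions, not part of this repository:

from sqlalchemy import create_engine
from sqlalchemy.orm import Session

# Illustrative connection URL only; the real one lives in the application config
engine = create_engine("postgresql+psycopg2cffi://user:pass@localhost/scraper")

with Session(engine) as session:
    # Ten most recently timestamped articles that have been parsed
    recent = (
        session.query(Article)
        .filter(Article.parsed != None)  # noqa: E711 -- SQLAlchemy needs != None
        .order_by(Article.timestamp.desc())
        .limit(10)
        .all()
    )
    for a in recent:
        # ondelete="SET NULL" means an article can outlive its root
        print(a.root.domain if a.root else "(no root)", a.heading)

Note that because the foreign key uses ondelete="SET NULL", deleting a Root leaves its articles in place with root_id set to NULL, so a.root can be None and callers should check for that.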
3 changes: 1 addition & 2 deletions doc.py
@@ -178,7 +178,6 @@ class DocxDocument(Document):
     BREAK_TAG = WORD_NAMESPACE + "br"

     def extract_text(self) -> str:
-
         zipfile = ZipFile(BytesIO(self.data), "r")

         # Verify that archive contains document.xml
@@ -229,7 +228,7 @@ def extract_text(self) -> str:
"application/x-pdf": PDFDocument,
"application/rtf": RTFDocument,
"application/vnd.oasis.opendocument.text": ODTDocument,
# Yes, really!
# Yes, really! Mime type naming by committee...
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": DocxDocument,
}

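The dictionary in the second hunk maps MIME types to Document subclasses; its name is not visible in the hunk, so MIME_TYPE_TO_DOC_CLASS below is a stand-in. A minimal sketch of the dispatch pattern, assuming each subclass takes the raw file bytes in its constructor (as DocxDocument's use of self.data suggests):

def text_from_bytes(mime_type: str, data: bytes) -> str:
    """Hypothetical helper, not from this commit: pick the Document
    subclass for a MIME type and extract plain text from raw bytes."""
    doc_class = MIME_TYPE_TO_DOC_CLASS.get(mime_type)
    if doc_class is None:
        raise ValueError("Unsupported MIME type: {0}".format(mime_type))
    return doc_class(data).extract_text()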
6 changes: 3 additions & 3 deletions requirements.txt
@@ -1,10 +1,10 @@
-Flask==2.3.2
+Flask==2.3.3
 flask-caching>=2.0.2
 Flask-Cors>=3.0.10
-SQLAlchemy==1.4.48
+SQLAlchemy==1.4.49
 sqlalchemy2-stubs>=0.0.2a30
 psycopg2cffi==2.9.0
-reynir-correct>=3.4.6
+reynir-correct==3.4.6
 striprtf>=0.0.22
 defusedxml>=0.7.1
 cachetools>=5.3.0
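The requirements change does two things: Flask and SQLAlchemy, already pinned with ==, are bumped to newer patch releases, and reynir-correct moves from >=3.4.6 to ==3.4.6, so pip install -r requirements.txt will now install exactly that version instead of whatever newer release is available.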
