Rm unused code, fixed requirements
sveinbjornt committed Oct 17, 2023
1 parent 9c7d637 commit 9f1ab73
Showing 4 changed files with 5 additions and 123 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -116,6 +116,7 @@ ENV/
 env.bak/
 venv.bak/
 p36*
+venv

 # Spyder project settings
 .spyderproject
118 changes: 0 additions & 118 deletions db/models.py
@@ -74,124 +74,6 @@ def __eq__(self, other: object) -> bool:  # type: ignore
 setattr(Base, "table", classmethod(lambda cls: cls.__table__))


-class Root(Base):
-    """Represents a scraper root, i.e. a base domain and root URL"""
-
-    __tablename__ = "roots"
-
-    # Primary key
-    id = Column(Integer, Sequence("roots_id_seq"), primary_key=True)
-
-    # Domain suffix, root URL, human-readable description
-    domain = Column(String, nullable=False)
-    url = Column(String, nullable=False)
-    description = Column(String)
-
-    # Default author
-    author = Column(String)
-    # Default authority of this source, 1.0 = most authoritative, 0.0 = least authoritative
-    authority = Column(Float)
-    # Finish time of last scrape of this root
-    scraped = Column(DateTime, index=True)
-    # Module to use for scraping
-    scr_module = Column(String(80))
-    # Class within module to use for scraping
-    scr_class = Column(String(80))
-    # Are articles of this root visible on the Greynir web?
-    visible = Column(Boolean, default=True)
-    # Should articles of this root be scraped automatically?
-    scrape = Column(Boolean, default=True)
-
-    # The combination of domain + url must be unique
-    __table_args__ = (UniqueConstraint("domain", "url"),)
-
-    def __repr__(self):
-        return "Root(domain='{0}', url='{1}', description='{2}')".format(
-            self.domain, self.url, self.description
-        )
-
-
-class Article(Base):
-    """Represents an article from one of the roots, to be scraped or having already been scraped"""
-
-    __tablename__ = "articles"
-
-    # The article URL is the primary key
-    url = Column(String, primary_key=True)
-
-    # UUID
-    id = Column(
-        psql_UUID(as_uuid=False),
-        index=True,
-        nullable=False,
-        unique=True,
-        server_default=text("uuid_generate_v1()"),
-    )
-
-    # Foreign key to a root
-    root_id = cast(
-        Optional[int],
-        Column(
-            Integer,
-            # We don't delete associated articles if the root is deleted
-            ForeignKey("roots.id", onupdate="CASCADE", ondelete="SET NULL"),
-        ),
-    )
-
-    # Article heading, if known
-    heading = Column(String)
-    # Article author, if known
-    author = Column(String)
-    # Article time stamp, if known
-    timestamp = Column(DateTime, index=True)
-
-    # Authority of this article, 1.0 = most authoritative, 0.0 = least authoritative
-    authority = Column(Float)
-    # Time of the last scrape of this article
-    scraped = Column(DateTime, index=True)
-    # Time of the last parse of this article
-    parsed = Column(DateTime, index=True)
-    # Time of the last processing of this article
-    processed = Column(DateTime, index=True)
-    # Time of the last indexing of this article
-    indexed = Column(DateTime, index=True)
-    # Module used for scraping
-    scr_module = Column(String(80))
-    # Class within module used for scraping
-    scr_class = Column(String(80))
-    # Version of scraper class
-    scr_version = Column(String(16))
-    # Version of parser/grammar/config
-    parser_version = Column(String(32))
-    # Parse statistics
-    num_sentences = Column(Integer)
-    num_parsed = Column(Integer)
-    ambiguity = Column(Float)
-
-    # The HTML obtained in the last scrape
-    html = Column(String)
-    # The parse tree obtained in the last parse
-    tree = Column(String)
-    # The tokens of the article in JSON string format
-    tokens = Column(String)
-    # The article topic vector as an array of floats in JSON string format
-    topic_vector = Column(String)
-
-    # The back-reference to the Root parent of this Article
-    # Modify this to RelationshipProperty[Root] once Pylance, Mypy and Python 3.6
-    # settle their differences
-    root: RelationshipProperty = relationship(  # type: ignore
-        "Root",
-        foreign_keys="Article.root_id",
-        backref=backref("articles", order_by=url),  # type: ignore
-    )
-
-    def __repr__(self):
-        return "Article(url='{0}', heading='{1}', scraped={2})".format(
-            self.url, self.heading, self.scraped
-        )
-
-

 class Entity(Base):
     """Represents a named entity"""

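The two classes deleted above are standard SQLAlchemy 1.4 declarative models: Article.root_id is a foreign key to roots.id, and the backref declared on Article.root exposes the reverse collection root.articles. For context, a minimal sketch of how such models are typically queried; the connection URL and session setup are illustrative assumptions, not part of this repository:

from sqlalchemy import create_engine
from sqlalchemy.orm import Session

# Illustrative connection URL only; the real one lives in the application config
engine = create_engine("postgresql+psycopg2cffi://user:pass@localhost/scraper")

with Session(engine) as session:
    # Ten most recently timestamped articles that have been parsed
    recent = (
        session.query(Article)
        .filter(Article.parsed != None)  # noqa: E711 -- SQLAlchemy needs != None
        .order_by(Article.timestamp.desc())
        .limit(10)
        .all()
    )
    for a in recent:
        # ondelete="SET NULL" means an article can outlive its root
        print(a.root.domain if a.root else "(no root)", a.heading)

Note that because the foreign key uses ondelete="SET NULL", deleting a Root leaves its articles in place with root_id set to NULL, so a.root can be None and callers should check for that.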
3 changes: 1 addition & 2 deletions doc.py
@@ -178,7 +178,6 @@ class DocxDocument(Document):
     BREAK_TAG = WORD_NAMESPACE + "br"

     def extract_text(self) -> str:
-
         zipfile = ZipFile(BytesIO(self.data), "r")

         # Verify that archive contains document.xml
@@ -229,7 +228,7 @@ def extract_text(self) -> str:
"application/x-pdf": PDFDocument,
"application/rtf": RTFDocument,
"application/vnd.oasis.opendocument.text": ODTDocument,
# Yes, really!
# Yes, really! Mime type naming by committee...
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": DocxDocument,
}

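The dictionary in the second hunk maps MIME types to Document subclasses; its name is not visible in the hunk, so MIME_TYPE_TO_DOC_CLASS below is a stand-in. A minimal sketch of the dispatch pattern, assuming each subclass takes the raw file bytes in its constructor (as DocxDocument's use of self.data suggests):

def text_from_bytes(mime_type: str, data: bytes) -> str:
    """Hypothetical helper, not from this commit: pick the Document
    subclass for a MIME type and extract plain text from raw bytes."""
    doc_class = MIME_TYPE_TO_DOC_CLASS.get(mime_type)
    if doc_class is None:
        raise ValueError("Unsupported MIME type: {0}".format(mime_type))
    return doc_class(data).extract_text()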
6 changes: 3 additions & 3 deletions requirements.txt
@@ -1,10 +1,10 @@
-Flask==2.3.2
+Flask==2.3.3
 flask-caching>=2.0.2
 Flask-Cors>=3.0.10
-SQLAlchemy==1.4.48
+SQLAlchemy==1.4.49
 sqlalchemy2-stubs>=0.0.2a30
 psycopg2cffi==2.9.0
-reynir-correct>=3.4.6
+reynir-correct==3.4.6
 striprtf>=0.0.22
 defusedxml>=0.7.1
 cachetools>=5.3.0
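The requirements change does two things: Flask and SQLAlchemy, already pinned with ==, are bumped to newer patch releases, and reynir-correct moves from >=3.4.6 to ==3.4.6, so pip install -r requirements.txt will now install exactly that version instead of whatever newer release is available.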
