Skip to content

Commit

Permalink
Merge pull request #15 from cicirello/development
Browse files Browse the repository at this point in the history
Fully re-implemented in Python
  • Loading branch information
cicirello authored Sep 11, 2020
2 parents d326e8b + 98e2468 commit d4ae654
Show file tree
Hide file tree
Showing 6 changed files with 157 additions and 86 deletions.
1 change: 0 additions & 1 deletion .dockerignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
*
!Dockerfile
!entrypoint.sh
!generatesitemap.py
2 changes: 2 additions & 0 deletions .github/workflows/build-and-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ jobs:

steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0

- name: Setup Python
uses: actions/setup-python@v2
Expand Down
3 changes: 1 addition & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,5 @@
# Licensed under the MIT License
FROM cicirello/alpine-plus-plus:latest
RUN apk add --no-cache --update python3
COPY entrypoint.sh /entrypoint.sh
COPY generatesitemap.py /generatesitemap.py
ENTRYPOINT ["/entrypoint.sh"]
ENTRYPOINT ["/generatesitemap.py"]
78 changes: 0 additions & 78 deletions entrypoint.sh

This file was deleted.

99 changes: 94 additions & 5 deletions generatesitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import sys
import re
import os
import subprocess

def gatherfiles(html, pdf) :
"""Walks the directory tree discovering
Expand Down Expand Up @@ -72,7 +73,7 @@ def urlsort(files) :
files - list of files to include in sitemap
"""
files.sort(key = lambda f : sortname(f))
files.sort(key = lambda s : s.count("/"))
files.sort(key = lambda f : f.count("/"))

def hasMetaRobotsNoindex(f) :
"""Checks whether an html file contains
Expand Down Expand Up @@ -110,10 +111,98 @@ def robotsBlocked(f) :
return False
return hasMetaRobotsNoindex(f)

def lastmod(f) :
    """Returns the last-modified date of a file as a string,
    formatted as required for the lastmod tag in an xml sitemap.
    The date is the committer date of the most recent git commit
    that touched the file (git log --format=%cI, strict ISO 8601).
    Keyword arguments:
    f - filename
    """
    completed = subprocess.run(
        ['git', 'log', '-1', '--format=%cI', f],
        stdout=subprocess.PIPE,
        universal_newlines=True)
    return completed.stdout.strip()

def urlstring(f, baseUrl) :
    """Forms a string with the full url from a filename and base url.
    Keyword arguments:
    f - filename
    baseUrl - address of the root of the website
    """
    # Drop the leading "." that os.walk-style relative paths carry.
    path = f
    if path[0] == "." :
        path = path[1:]
    # Directory urls are canonical: strip a trailing index.html.
    if path.endswith("index.html") :
        path = path[:-10]
    # Join base and path with exactly one "/" between them.
    baseHasSlash = baseUrl.endswith("/")
    pathHasSlash = path.startswith("/")
    if baseHasSlash and pathHasSlash :
        path = path[1:]
    elif not baseHasSlash and not pathHasSlash :
        path = "/" + path
    return baseUrl + path

def xmlSitemapEntry(f, baseUrl, dateString) :
    """Forms a string with an entry formatted for an xml sitemap
    including lastmod date.
    Keyword arguments:
    f - filename
    baseUrl - address of the root of the website
    dateString - lastmod date correctly formatted
    """
    loc = urlstring(f, baseUrl)
    parts = [ "<url>\n<loc>", loc, "</loc>\n<lastmod>", dateString, "</lastmod>\n</url>" ]
    return "".join(parts)

def writeTextSitemap(files, baseUrl) :
    """Writes a plain text sitemap to the file sitemap.txt,
    one full url per line.
    Keyword Arguments:
    files - a list of filenames
    baseUrl - the base url to the root of the website
    """
    with open("sitemap.txt", "w") as out :
        out.writelines(urlstring(f, baseUrl) + "\n" for f in files)

def writeXmlSitemap(files, baseUrl) :
    """Writes an xml sitemap to the file sitemap.xml, one <url>
    entry per file, each with a lastmod date taken from git history.
    Keyword Arguments:
    files - a list of filenames
    baseUrl - the base url to the root of the website
    """
    header = ('<?xml version="1.0" encoding="UTF-8"?>\n'
              '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
    with open("sitemap.xml", "w") as out :
        out.write(header)
        for f in files :
            out.write(xmlSitemapEntry(f, baseUrl, lastmod(f)))
            out.write("\n")
        out.write('</urlset>\n')

if __name__ == "__main__" :
    # Command line arguments, as passed by the GitHub Action:
    # 1: path to the root of the website
    # 2: base url of the website
    # 3: "true" to include html files in the sitemap
    # 4: "true" to include pdf files in the sitemap
    # 5: sitemap format: "xml", or anything else for plain text
    websiteRoot = sys.argv[1]
    baseUrl = sys.argv[2]
    includeHTML = sys.argv[3]=="true"
    includePDF = sys.argv[4]=="true"
    sitemapFormat = sys.argv[5]

    # Work relative to the website root so that discovered file
    # paths are site-relative (and so git log sees the right repo).
    os.chdir(websiteRoot)

    # NOTE: a stale pre-argument-parsing call to gatherfiles (which
    # misused argv[1]/argv[2] as booleans and was immediately
    # overwritten here) has been removed.
    allFiles = gatherfiles(includeHTML, includePDF)
    # Exclude files blocked by robots.txt or a meta robots noindex tag.
    files = [ f for f in allFiles if not robotsBlocked(f) ]
    urlsort(files)
    for f in files :
        print(f)
    print("RobotsBlockedCount:",len(allFiles)-len(files))

    pathToSitemap = websiteRoot
    if pathToSitemap[-1] != "/" :
        pathToSitemap += "/"
    if sitemapFormat == "xml" :
        writeXmlSitemap(files, baseUrl)
        pathToSitemap += "sitemap.xml"
    else :
        writeTextSitemap(files, baseUrl)
        pathToSitemap += "sitemap.txt"

    # Report outputs to the GitHub Actions runner.
    print("::set-output name=sitemap-path::" + pathToSitemap)
    print("::set-output name=url-count::" + str(len(files)))
    print("::set-output name=excluded-count::" + str(len(allFiles)-len(files)))
60 changes: 60 additions & 0 deletions tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,63 @@ def test_gatherfiles_pdf(self) :
"./subdir/subdir/z.pdf"}
self.assertEqual(asSet, expected)

def test_lastmod(self) :
    """Verifies that lastmod returns a correctly formatted ISO 8601
    timestamp, as emitted by git log --format=%cI."""
    def validateDate(s) :
        # Expected shape: YYYY-MM-DDTHH:MM:SS followed by a timezone
        # designator: Z, or +HH:MM / -HH:MM (git %cI can emit any of
        # these depending on the committer's timezone).
        if not s[0:4].isdigit() or s[4]!="-" or not s[5:7].isdigit() :
            return False
        if s[7]!="-" or not s[8:10].isdigit() or s[10]!="T" :
            return False
        if not s[11:13].isdigit() or s[13]!=":" or not s[14:16].isdigit() :
            return False
        if s[16]!=":" or not s[17:19].isdigit() :
            return False
        if s[19:] == "Z" :
            return True
        if len(s) < 20 or s[19] not in "+-" :
            return False
        return s[20:22].isdigit() and s[22]==":" and s[23:25].isdigit()
    os.chdir("tests")
    # Validate the format of the result, not mere truthiness: the
    # original assertions passed for any nonempty string and never
    # actually called validateDate.
    self.assertTrue(validateDate(gs.lastmod("./unblocked1.html")))
    self.assertTrue(validateDate(gs.lastmod("./subdir/a.html")))
    os.chdir("..")

def test_urlstring(self) :
    """Checks urlstring over "./", "/", and bare relative paths,
    with and without a trailing slash on the base url."""
    root = "https://TESTING.FAKE.WEB.ADDRESS.TESTING"
    suffixes = [ "a.html",
                 "index.html",
                 "subdir/a.html",
                 "subdir/index.html",
                 "subdir/subdir/a.html",
                 "subdir/subdir/index.html"
                 ]
    # index.html collapses to its directory url in each case.
    expected = [ root + "/a.html",
                 root + "/",
                 root + "/subdir/a.html",
                 root + "/subdir/",
                 root + "/subdir/subdir/a.html",
                 root + "/subdir/subdir/"
                 ]
    # Each prefix style must map to the same six expected urls,
    # whether or not the base url ends with a slash.
    for prefix in [ "./", "/", "" ] :
        for i, suffix in enumerate(suffixes) :
            f = prefix + suffix
            self.assertEqual(expected[i], gs.urlstring(f, root + "/"))
            self.assertEqual(expected[i], gs.urlstring(f, root))

def test_xmlSitemapEntry(self) :
    """Checks that a sitemap <url> entry is assembled correctly
    from filename, base url, and lastmod date."""
    base = "https://TESTING.FAKE.WEB.ADDRESS.TESTING/"
    f = "./a.html"
    date = "2020-09-11T13:35:00-04:00"
    expected = ("<url>\n"
                "<loc>https://TESTING.FAKE.WEB.ADDRESS.TESTING/a.html</loc>\n"
                "<lastmod>2020-09-11T13:35:00-04:00</lastmod>\n"
                "</url>")
    self.assertEqual(gs.xmlSitemapEntry(f, base, date), expected)


0 comments on commit d4ae654

Please sign in to comment.