Skip to content

Commit

Permalink
Merge pull request #1 from DEFI-COLaF/visualisationHTML
Browse files Browse the repository at this point in the history
Visualisation html
  • Loading branch information
Juliettejns authored Sep 19, 2024
2 parents 102e6ff + 48250d4 commit dfa0c78
Show file tree
Hide file tree
Showing 149 changed files with 205,113 additions and 77,105 deletions.
38 changes: 38 additions & 0 deletions .github/workflows/htmlgenerator.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
name: HTMLgenerator

on:
push:
branches:
- visualisationHTML
- main

jobs:
createFile:
runs-on: ubuntu-latest
permissions:
contents: write

steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2

- name: Set up Python (if you use Python for scripts)
uses: actions/setup-python@v4
with:
python-version: '3.x'

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install lxml
- name: Create HTML pages
run: |
python3 XML2HTML.py
git config --global user.name "GitHub Actions"
git config --global user.email "actions@github.com"
git add .
git commit -m "Update files"
git push
env:
GITHUB_TOKEN: ${{ secrets.SECRET_GITHUB_TOKEN }}
3 changes: 0 additions & 3 deletions .idea/.gitignore

This file was deleted.

14 changes: 0 additions & 14 deletions .idea/Molye.iml

This file was deleted.

6 changes: 0 additions & 6 deletions .idea/inspectionProfiles/profiles_settings.xml

This file was deleted.

7 changes: 0 additions & 7 deletions .idea/misc.xml

This file was deleted.

8 changes: 0 additions & 8 deletions .idea/modules.xml

This file was deleted.

6 changes: 0 additions & 6 deletions .idea/vcs.xml

This file was deleted.

2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ where we store the full documents used to create the Molyé timeline. The datase
which is mainly a by-product of the compilation procedure varying according to the type of work.

To find examples of a specific language, try searching xml:lang="[lang-code]". For convenience we have also included some pre-compiled subcorpora in the subcorpora folder. To access them, first go to the subfolder for the relevant time period and then find the file with the correct lang-code.

The HTML version of the dataset is available [here](https://defi-colaf.github.io/Molye/dataset_HTML/index.html).
147 changes: 147 additions & 0 deletions XML2HTML.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import os
from lxml import etree

# Paths for your XML files, XSL file, and output HTML folder
xml_folder = 'dataset_colaf/'
xsl_file = 'XML2HTML_doc.xsl'
output_folder = 'dataset_HTML/'
index_file = os.path.join(output_folder, 'index.html')

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Load the XSLT file
xslt_root = etree.parse(xsl_file)
transform = etree.XSLT(xslt_root)

content=[]

# Iterate over each XML file in the directory
for xml_subfolder in os.listdir(xml_folder):
for xml_filename in os.listdir(xml_folder+'/'+xml_subfolder):
if xml_filename.endswith('.xml'):
xml_path = os.path.join(xml_folder, xml_subfolder, xml_filename)
html_filename = os.path.splitext(xml_filename)[0] + '.html'
output_html_path = os.path.join(output_folder, html_filename)

# Parse the XML file
xml_root = etree.parse(xml_path)

# Transform the XML to HTML using the XSL file
result_tree = transform(xml_root)

# Save the HTML result
with open(output_html_path, 'wb') as f:
f.write(etree.tostring(result_tree, pretty_print=True, method='html'))

# Extract the title from the HTML (assuming <title> element is created)
title = xml_root.find('.//tei:title[@type="main"]', namespaces={"tei":"http://www.tei-c.org/ns/1.0"}).text
date = xml_root.find('.//tei:bibl[@type="PrintSource"]/tei:date', namespaces={"tei": "http://www.tei-c.org/ns/1.0"}).attrib['when']
langue_xml = xml_root.findall('.//tei:language/tei:name', namespaces={"tei": "http://www.tei-c.org/ns/1.0"})
langue=[]
for lang in langue_xml:
if 'Metropolitan France' not in lang.text:
langue.append(lang.text)
keywords_xml = xml_root.findall('.//tei:term', namespaces={"tei":"http://www.tei-c.org/ns/1.0"})
keywords = []
for key in keywords_xml:
keywords.append(key.text)
# Append the title and corresponding HTML file to the list for the index
xml_path = xml_folder+xml_subfolder+'/'+xml_filename
content.append([title, html_filename, langue, keywords, xml_path, xml_subfolder, date])

# Create the page d'accueil (index.html)
with open(index_file, 'w', encoding='utf-8') as f:
f.write("""<!DOCTYPE html>\n<html>\n<head>\n<title>Corpus Page d\'Accueil</title><style>
table {
width: 100%;
border-collapse: collapse;
margin: 25px 0;
font-size: 18px;
font-family: Arial, sans-serif;
text-align: left;
}
th, td {
padding: 12px 15px;
border: 1px solid #ddd;
}
th {
background-color: #4CAF50;
color: white;
}
tr:nth-child(even) {
background-color: #f2f2f2;
}
tr:hover {
background-color: #ddd;
}
caption {
caption-side: top;
font-size: 1.5em;
margin-bottom: 10px;
font-weight: bold;
}
body {
font-family: Arial, sans-serif;
line-height: 1.6;
margin: 20px;
background-color: #f4f4f4;
color: #333;
}
h1 {
color: #2c3e50;
font-size: 2.5em;
text-align: center;
margin-bottom: 20px;
}
p {
font-size: 1.2em;
margin-bottom: 20px;
}
a {
color: #2980b9;
text-decoration: none;
}
a:hover {
text-decoration: underline;
}
.container {
max-width: 800px;
margin: auto;
padding: 20px;
background-color: #fff;
border-radius: 8px;
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
}
</style>""")
f.write("""<h1>Molyé Corpus</h1>\n<div class="container"><p>The Molyé corpus is a diachronic collection of stereotypical representation of French language variation during the early modern period as well as early attestations of French Creoles.</p>
<p>The goal of the project is to demonstrate that several Creole features which are posited to be the result of pidginization in French colonies can in fact be traced to Europe, both individually and in combination with each other.</p>
<p>Below is a spreadsheet detailing both <a href="https://docs.google.com/spreadsheets/d/e/2PACX-1vS43h47Gq3x6F_qt5DoByeztvZJSkifAOsuLNuVsbawh8KHlrpRbX29CAEsruipmLZd7ugA3_GoQDZk/pubhtml">the contents of the Molyé corpus proper and the full list of 250+ documents</a> that were found in the corpus creation process.</p>
<p>This work has been done as part of the <a href="https://colaf.huma-num.fr/">COlaF</a> project, Corpus and Tools for the Languages of France, developed by <a href="https://www.inria.fr/en">Inria</a>.</p>
</div>
<table><caption>Corpus Content\n""")
f.write("""<thead>
<tr>
<th>SubCorpus
</th><th>Title</th>
<th>Date</th>
<th>Languages</th>
<th>Genres</th>
<th>Links</th>
</tr></thead><tbody>\n""") # Table headers

content_sorted_by_date = sorted(content, key=lambda x: x[6])
for sublist in content_sorted_by_date:
f.write(f'<tr><td>{sublist[5]}</td><td>{sublist[0]}<td>{sublist[6]}</td><td>{",".join(sublist[2])}</td><td>{",".join(sublist[3])}</td><td><a href="{sublist[1]}">HTML</a>,<a href="https://github.com/DEFI-COLaF/Molye/tree/main/{sublist[4]}">XML</a></td></tr>\n')

f.write("""<tbody></table></body>\n</html>""")
Loading

0 comments on commit dfa0c78

Please sign in to comment.