Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tool to convert Google Docs to Markdown #105

Merged
merged 1 commit into from
Oct 29, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 242 additions & 0 deletions _minutes/export-minutes.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
<!DOCTYPE html>
<!--
This is a tool to convert the minutes from Google Docs to GitHub-flavored Markdown.
It is designed for use with https://github.com/w3c/webextensions
and only supports the (standard Google Docs) syntax from
https://docs.google.com/document/d/1QkwhEMtMS67JBUkl_WVPZ4lRSKoWcQNlLJSf_GwSXg8/edit

Questions? Ask rob@robwu.nl
-->
<head>
<meta charset="utf-8">
<title>WECG minutes converter - from Google Docs to Markdown</title>
<style>
html, body {
height: 100%;
margin: 0;
padding: 0;
}
body {
display: flex;
flex-direction: column;
}
#extraInfoOutput {
white-space: pre-wrap;
height: 7em;
}
#input, #output {
flex: 1;
overflow: auto;
background: lightgrey;
}
</style>
</head>
<body>
<div>Select the text in Google Docs and Paste the contents below:</div>
<div id="input" contenteditable></div>
<div>
<input type="button" id="convert" value="Convert above paste from Google Doc to (Github-flavored) markdown">
</div>
<div id="extraInfoOutput"></div>
<textarea id="output" placeholder="Markdown output appears here"></textarea>
<script>
// Page elements used by the converter. They are looked up once at load time
// and never reassigned, hence `const`.
// Contenteditable area that receives the paste from Google Docs.
const input = document.getElementById("input");
// Textarea that shows the generated markdown.
const output = document.getElementById("output");
// Area that lists the issues/PRs found in the generated markdown.
const extraInfoOutput = document.getElementById("extraInfoOutput");
// Button that triggers the conversion.
const convert = document.getElementById("convert");

// Click handler: converts the pasted content to markdown, shows it in the
// output textarea, and reports which w3c/webextensions issues and PRs the
// markdown links to, plus "issue N" mentions that have no matching link.
convert.onclick = function() {
  const markdownText = convertToMarkdown(input);
  output.value = markdownText;

  // Sets keep insertion order, so numbers are reported in order of appearance.
  const issues = new Set();
  const prs = new Set();
  const linkPattern = /https:\/\/github\.com\/w3c\/webextensions\/(issues|pull)\/(\d+)/g;
  for (const [, issueOrPr, issueNr] of markdownText.matchAll(linkPattern)) {
    const target = issueOrPr === "pull" ? prs : issues;
    target.add(issueNr);
  }

  // Plain-text mentions ("issue 123") whose number was not linked anywhere.
  const mentionedWithoutLink = new Set();
  for (const [, issueNr] of markdownText.matchAll(/\sissue (\d+)/gi)) {
    const isLinked = issues.has(issueNr) || prs.has(issueNr);
    if (!isLinked) {
      mentionedWithoutLink.add(issueNr);
    }
  }

  // Formats a collection of numbers as "#1, #2", or "-" when it is empty.
  const serializeIssues = (issueNrs) =>
    [...issueNrs].map((issueNr) => `#${issueNr}`).join(", ") || "-";

  // The report starts with an empty line, exactly like the original text.
  extraInfoOutput.textContent = [
    "",
    "List of issues/PRs in order of appearance in the input:",
    `- Issues: ${serializeIssues(issues)}`,
    `- PRs: ${serializeIssues(prs)}`,
    `- Mentioned issues without link to issue: ${serializeIssues(mentionedWithoutLink)}`,
  ].join("\n");
};

/**
 * Converts a pasted Google Docs fragment to GitHub-flavored markdown.
 *
 * The input is deep-cloned first; all markup is inserted into the clone.
 * The steps, in order:
 *
 * - Apply code formatting (Courier New spans are wrapped in backticks).
 * - Replace < with &lt;
 * - Replace * and _ that touch whitespace or the edge of a text node
 *   with \* and \_.
 * - Replace boldfaced with **xx**
 * - Replace italic with _xx_
 * - Replace links with [text](anchor), unless the link text equals the href.
 * - Replace h1, h2, h3 with #, ## and ###
 * - Format h1 header for consistency.
 * - Replace ol,ul and li with list items indented by two spaces per
 *   nesting level.
 * - Fixup whitespace.
 *
 * @param {Element} elemRootInput - Element that contains the pasted content.
 * @returns {string} The markdown text, trimmed at both ends.
 */
function convertToMarkdown(elemRootInput) {
  const root = elemRootInput.cloneNode(true);

  // Apply code formatting first, before escaping characters.
  for (const c of root.querySelectorAll(`span[style*="font-family:'Courier New'"]`)) {
    c.prepend("`");
    c.append("`");
    // replaceAllInTextNodes skips ` only if they are in the same text node.
    c.normalize();
  }

  // Escape < to avoid rendering as HTML.
  replaceAllInTextNodes(root, "<", "&lt;");

  // Replace all unescaped _ and * with escaped ones to avoid undesired formatting.
  replaceAllInTextNodes(root, /(?<=\s|^)[*_]|[*_](?=\s|$)/g, "\\$&");

  // Apply boldfaced appearance.
  for (const b of root.querySelectorAll(`span[style*="font-weight:700"]`)) {
    b.prepend("**");
    b.append("**");
  }

  // Apply italic appearance.
  for (const i of root.querySelectorAll(`span[style*="font-style:italic"]`)) {
    i.prepend("_");
    i.append("_");
  }

  // Render links.
  for (const a of root.querySelectorAll("a[href]")) {
    if (a.href === a.textContent.trim()) {
      // The visible text already is the URL; leave it as a bare link.
      continue;
    }
    // A literal ")" would terminate the markdown link target early.
    const href = a.href.replaceAll(")", "%29");
    a.prepend("[");
    a.append(`](${href})`);
  }

  // Format headers
  for (const h of root.querySelectorAll("h1")) {
    // Replace header:
    // WECG Meetings 2021, Public Notes—Oct 28, 2021
    // WECG Meetings 2021, Public Notes, Oct 28
    replaceAllInTextNodes(
      h,
      /(WECG Meetings \d{4}, Public Notes)—([A-Za-z]+ \d{1,2}), \d{4}/g,
      "$1, $2"
    );
    h.prepend(`\n# `);
  }
  for (const h of root.querySelectorAll("h2")) {
    h.prepend(`\n## `);
  }
  for (const h of root.querySelectorAll("h3")) {
    h.prepend(`\n### `);
  }

  for (const li of root.querySelectorAll("li")) {
    // Nesting level = number of OL/UL ancestors between the item and root.
    let level = 0;
    for (let ancestor = li.parentNode; ancestor !== root; ancestor = ancestor.parentNode) {
      if (ancestor.tagName === "OL" || ancestor.tagName === "UL") {
        ++level;
      }
    }
    const list = li.parentNode;
    const listItems = Array.from(list.children).filter(e => e.tagName === "LI");
    const listIndex = listItems.indexOf(li) + 1;

    // Top-level (level 1) has no extra indentation, other levels 2 spaces per level.
    // The count is clamped because an <li> without any list ancestor has
    // level 0, and String.prototype.repeat throws on a negative count.
    let prefix = "  ".repeat(Math.max(level - 1, 0));
    if (list.tagName === "OL") {
      prefix += ` ${listIndex}. `;
    } else {
      prefix += " * ";
    }
    li.prepend(prefix);
    const isNewList = list.previousElementSibling?.tagName !== list.tagName;
    if (level === 1 && listIndex === 1 && isNewList) {
      // Insert blank line before top-level list.
      li.before("\n");
    }
  }

  // Forced line break after every paragraph and br.
  for (const elem of root.querySelectorAll("p, br")) {
    elem.after("\n");
  }
  // Blank line after every header.
  for (const elem of root.querySelectorAll("h1,h2,h3")) {
    elem.after("\n\n");
  }

  // From here on only the flattened text is processed.
  let textContent = root.textContent;

  // Normalize ’ to '.
  textContent = textContent.replaceAll("’", "'");

  // Normalize non-breaking whitespace to regular whitespace.
  textContent = textContent.replaceAll("\xA0", " ");

  // Docs sometimes appends a space to a link even if not in the source text. Strip it
  // from the link text and keep a single space after the link instead.
  textContent = textContent.replaceAll(/ +(\]\([^)\n]+\)) */g, "$1 ");

  // Trim trailing whitespace.
  textContent = textContent.replaceAll(/ +$/gm, "");

  // Remove consecutive line breaks to at most one empty line.
  // May happen if header is followed by enumeration.
  textContent = textContent.replace(/(\n\n)\n+/g, "$1");

  // Add one more line break in front of every header line, so that a
  // section header ends up with two blank lines in front of it.
  textContent = textContent.replace(/^(?=#+ )/gm, "\n");

  // Trim leading and trailing whitespace.
  textContent = textContent.trim();

  return textContent;
}

/**
 * Applies String.prototype.replaceAll(pattern, replacement) to the text
 * nodes below root, leaving backtick-delimited code untouched.
 *
 * A text node with an even number of backticks is split at the backticks:
 * the segments between a pair of backticks are kept as-is and only the
 * segments outside of them are rewritten. A text node with an odd number
 * of backticks is rewritten as a whole.
 *
 * @param {Node} root - Node whose descendant text nodes are processed.
 * @param {string|RegExp} pattern - Passed on to replaceAll.
 * @param {string} replacement - Passed on to replaceAll.
 */
function replaceAllInTextNodes(root, pattern, replacement) {
  const walker = document.createTreeWalker(root, NodeFilter.SHOW_TEXT);

  // Collect the changes first and apply them after the walk has finished.
  const pending = [];
  let textNode;
  while ((textNode = walker.nextNode())) {
    const before = textNode.nodeValue;
    const segments = before.split("`");
    // n backticks give n + 1 segments: an odd count means balanced backticks.
    const hasBalancedBackticks = segments.length % 2 === 1;

    let after;
    if (hasBalancedBackticks) {
      after = segments
        .map((segment, index) => {
          // Odd-indexed segments sit between two backticks (code).
          const isInsideBackticks = index % 2 === 1;
          return isInsideBackticks ? segment : segment.replaceAll(pattern, replacement);
        })
        .join("`");
    } else {
      after = before.replaceAll(pattern, replacement);
    }

    if (after !== before) {
      pending.push({ textNode, after });
    }
  }

  pending.forEach(({ textNode: stale, after }) => {
    stale.parentNode.replaceChild(document.createTextNode(after), stale);
  });
}
</script>
</body>
</html>