Skip to content

Commit

Permalink
Merge pull request #1 from vxern/document-private-api-members
Browse files Browse the repository at this point in the history
docs: Document API members, export (almost) everything, improve immutability.
  • Loading branch information
vxern authored Mar 2, 2024
2 parents a048716 + 022aa7e commit 646754d
Show file tree
Hide file tree
Showing 20 changed files with 344 additions and 144 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
## 0.2.0

- Document all API members, export them as well.
- Use better naming:
  - Rename "lemma" to "dictionary entry".
  - Rename "inflection table" to "inflection model".
  - Rename "header" and "body" to "heading" and "table" for inflection models.
  - Use "scrape" instead of "parse".
- Make returned objects read-only.

## 0.1.2

- Bumped dependency versions in `package.json`, added an explicit import for `ts-node`.
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## A lightweight Dexonline.ro page scraper to fetch information about words in the Romanian language.
## A tiny, battle-tested, performant and documented scraper for dexonline.ro.

### Usage

Expand Down
3 changes: 3 additions & 0 deletions biome.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
},
"suspicious": {
"noEmptyInterface": "off"
},
"complexity": {
"noExcessiveCognitiveComplexity": "off"
}
}
},
Expand Down
2 changes: 1 addition & 1 deletion jsr.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"name": "@vxern/dexonline-scraper",
"version": "0.1.2",
"version": "0.2.0",
"exports": "./src/index.ts"
}
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"name": "dexonline-scraper",
"description": "A lightweight Dexonline.ro page scraper to fetch information about words in the Romanian language.",
"description": "A tiny, battle-tested, performant and documented scraper for dexonline.ro.",
"license": "MIT",
"version": "0.1.2",
"version": "0.2.0",
"type": "module",
"main": "./dist/index.js",
"types": "./dist/index.d.ts",
Expand Down
11 changes: 9 additions & 2 deletions src/constants/copyright.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
export default [
/**
* @remarks
* This is a list of dictionary identifiers that are under copyright, and cannot be queried without explicit permission.
*
* `dexonline-scraper` filters out entries from these dictionaries by default; however, this can be overridden
* if explicit permission has been obtained for a given dictionary.
*/
export default Object.freeze([
"Petro-Sedim",
"Legislație",
"DLR",
Expand Down Expand Up @@ -58,4 +65,4 @@ export default [
"DAN",
"Șăineanu, ed. I",
"DASLR",
];
] satisfies string[]);
4 changes: 2 additions & 2 deletions src/constants/expressions.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
export default {
export default Object.freeze({
treeType: /^type-(\w+)$/,
relationType: /^me-(\d+)$/,
tableLemmaWithIndex: /((?:[a-zA-ZăĂâÂîÎșȘțȚ-]+))(<sup>(\d+)<\/sup>)?/,
};
} as const satisfies Record<string, RegExp>);
4 changes: 2 additions & 2 deletions src/constants/links.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
export default {
export default Object.freeze({
definition: (word: string): string => `https://dexonline.ro/definitie/${word}`,
};
} as const satisfies Record<string, unknown>);
6 changes: 3 additions & 3 deletions src/constants/selectors.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { ContentTabs } from "../options.js";

export default {
export default Object.freeze({
contentTab: (tab: ContentTabs): string => `#tab_${tab}`,
contentTabs: {
synthesis: {
Expand Down Expand Up @@ -41,7 +41,7 @@ export default {
element: "div",
lemma: "span[class=lexemeName]",
// There is indeed a trailing whitespace here.
// Do not ask me why: I don't know.
// Do not ask me why for I do not know.
tag: 'span[class="tag "]',
},
body: {
Expand All @@ -51,4 +51,4 @@ export default {
},
},
},
};
} as const);
42 changes: 28 additions & 14 deletions src/index.ts
Original file line number Diff line number Diff line change
@@ -1,23 +1,33 @@
import * as cheerio from "cheerio";
import copyrightedDictionaries from "./constants/copyright.js";
import Expressions from "./constants/expressions.js";
import Links from "./constants/links.js";
import Selectors from "./constants/selectors.js";
import { DictionaryFlags, MatchingModes, ParserOptions, SearchOptionsWithWord } from "./options.js";
import * as Inflection from "./tabs/inflection.js";
import * as Synthesis from "./tabs/synthesis.js";

/** The default search options. */
const defaultSearchOptions: ParserOptions = {
const defaultSearchOptions = Object.freeze({
mode: "lax",
excludeCopyrighted: true,
flags: DictionaryFlags.None,
} as const;
} as const satisfies ParserOptions);

/**
 * The default search options extended with an empty `word`.
 *
 * @remarks
 * Used as the default value of the `options` parameter of {@link parse}. The object is frozen, so it cannot be
 * modified at runtime.
 */
const defaultSearchOptionsWithWord = Object.freeze({
...defaultSearchOptions,
word: "",
} as const satisfies SearchOptionsWithWord);

/** Represents the results of a word search using `dexonline-scraper`. */
export interface Results {
synthesis: Synthesis.Lemma[];
inflection: Inflection.InflectionTable[];
readonly synthesis: Synthesis.DictionaryEntry[];
readonly inflection: Inflection.InflectionModel[];
}

/**
* Taking a {@link word} and (optionally) a set of {@link ParseOptions}, searches
* Taking a {@link word} and (optionally) a set of {@link ParserOptions}, searches
* for the word on dexonline, returning a {@link Results} object or {@link undefined}
* if not found.
*
Expand Down Expand Up @@ -53,24 +63,28 @@ export async function get(
* @param options - Options for searching.
* @returns A {@link Results} object containing the scraped synthesis and inflection data.
*/
export function parse(
contents: string,
options: SearchOptionsWithWord<true> = { ...defaultSearchOptions, word: "" },
): Results {
export function parse(contents: string, options: SearchOptionsWithWord<true> = defaultSearchOptionsWithWord): Results {
const $ = cheerio.load(contents);

const optionsFilled: SearchOptionsWithWord<false> = {
...defaultSearchOptions,
...options,
};

const synthesis = Synthesis.parse($, optionsFilled);
const inflection = Inflection.parse($, optionsFilled);
const synthesis = Synthesis.scrape($, optionsFilled);
const inflection = Inflection.scrape($, optionsFilled);

return { synthesis, inflection };
}

export * from "./tabs/inflection.js";
export * from "./tabs/synthesis.js";
export { DictionaryFlags, MatchingModes, Synthesis, Inflection, Links };
export {
DictionaryFlags,
MatchingModes,
Synthesis,
Inflection,
Links,
Expressions,
Selectors,
copyrightedDictionaries,
};
export type { ParserOptions };
26 changes: 13 additions & 13 deletions src/options.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,29 +27,29 @@ export interface ParserOptions {
*
* @defaultValue `"lax"`
*/
mode: MatchingModes;
readonly mode: MatchingModes;

/**
* Specifies whether the parser should exclude copyrighted dictionaries.
*
* @defaultValue `true`
*/
excludeCopyrighted: boolean;
readonly excludeCopyrighted: boolean;

/** Configures Dexonline's response. */
flags: DictionaryFlags;
readonly flags: DictionaryFlags;
}

export type SearchOptionsWithWord<IsPartial extends boolean = false> = (IsPartial extends true
? Partial<ParserOptions>
: ParserOptions) &
(
| { mode: "lax" }
| {
mode: "strict";
word: string;
}
);
/**
 * Parser options combined with the word being searched for.
 *
 * @remarks
 * - In `"lax"` mode, `word` is optional.
 * - In `"strict"` mode, `word` is required.
 *
 * The resulting type is wrapped in `Readonly`, making its top-level properties read-only.
 *
 * @typeParam IsPartial - When `true`, the {@link ParserOptions} part of the type is made partial; when `false`
 * (the default), it is required in full. In both cases `mode` is still required by the mode-specific part of the type.
 */
export type SearchOptionsWithWord<IsPartial extends boolean = false> = Readonly<
(IsPartial extends true ? Partial<ParserOptions> : ParserOptions) &
(
| { mode: "lax"; word?: string }
| {
mode: "strict";
word: string;
}
)
>;

/**
* Bit-based flags for configuring the dictionary and the results sent back by it.
Expand Down
83 changes: 61 additions & 22 deletions src/tabs/inflection.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,44 +3,67 @@ import Expressions from "../constants/expressions.js";
import Selectors from "../constants/selectors.js";
import { ContentTabs, SearchOptionsWithWord } from "../options.js";

export interface InflectionTable extends Header, Body {}

interface Header {
tags: string[];
index: number;
lemma: string;
/**
 * Represents an inflection model as scraped directly from the models on Dexonline.
 *
 * @remarks
 * Combines the fields of the model's {@link Heading} (`tags`, `index`, `lemma`) with those of its {@link Table}
 * (`table`) in a single flat object.
 */
export interface InflectionModel extends Heading, Table {}

/** Represents the heading of a Dexonline inflection model. */
interface Heading {
/** The tags for a given inflection model. */
readonly tags: string[];
/** The index of the inflection model as shown on the webpage. */
readonly index: number;
/**
 * The lemma this inflection model applies to.
 *
 * @remarks
 * In `"strict"` mode, {@link scrape} only keeps models whose lemma is equal to the searched word.
 */
readonly lemma: string;
}

interface Body {
table: string[][];
/** Represents the body (table) of a Dexonline inflection model. */
interface Table {
/**
 * The HTML table represented as a 2D array of rows and columns.
 *
 * @remarks
 * {@link scrape} skips inflection models whose table is empty, so models returned by it always have at least
 * one row.
 */
readonly table: string[][];
}

export function parse($: CheerioAPI, options: SearchOptionsWithWord): InflectionTable[] {
/**
* Given a {@link $|Cheerio document handle} and additional {@link options} for scraping entries, scrapes the inflection
* models on the page.
*
* @param $ - A Cheerio document handle for the webpage.
* @param options - Options for the scraper.
* @returns An array of the scraped {@link InflectionModel|inflection models}.
*/
export function scrape($: CheerioAPI, options: SearchOptionsWithWord): InflectionModel[] {
const inflection = $(Selectors.contentTab(ContentTabs.Inflection));

const entries = inflection.find(Selectors.contentTabs.inflection.entry.element).toArray();

const tables: InflectionTable[] = [];
const tables: InflectionModel[] = [];
for (const entry of entries) {
const tableElement = $(entry).children(Selectors.contentTabs.inflection.entry.table.element).first();

const header = parseHeader($, tableElement);
if (options.mode === "strict" && header.lemma !== options.word) {
const heading = scrapeHeading($, tableElement);
if (options.mode === "strict" && heading.lemma !== options.word) {
continue;
}

const body = parseBody($, tableElement);
const body = scrapeTable($, tableElement);
if (body.table.length === 0) {
continue;
}

tables.push({ ...header, ...body });
tables.push({ ...heading, ...body });
}
return tables;
}

function parseHeader($: CheerioAPI, header: Cheerio<Element>): Header {
const section = header.children(Selectors.contentTabs.inflection.entry.table.header.element);
/**
* Given a {@link $|Cheerio document handle} and the {@link $heading} for the inflection model on the webpage, scrapes
* its heading.
*
* @param $ - A Cheerio document handle for the webpage.
* @param $heading - A Cheerio document handle for the heading of the inflection model.
* @returns The scraped inflection model {@link Heading}.
*/
export function scrapeHeading($: CheerioAPI, $heading: Cheerio<Element>): Heading {
const section = $heading.children(Selectors.contentTabs.inflection.entry.table.header.element);

const lemmaString = section.children(Selectors.contentTabs.inflection.entry.table.header.lemma).html() ?? undefined;
if (lemmaString === undefined) {
Expand Down Expand Up @@ -69,8 +92,16 @@ function parseHeader($: CheerioAPI, header: Cheerio<Element>): Header {
return { tags, lemma, index };
}

function parseBody($: CheerioAPI, body: Cheerio<Element>): Body {
const section = body.children(Selectors.contentTabs.inflection.entry.table.body.element);
/**
* Given a {@link $|Cheerio document handle} and the {@link $body} for the inflection model on the webpage, scrapes
* its table.
*
* @param $ - A Cheerio document handle for the webpage.
* @param $body - A Cheerio document handle for the table of the inflection model.
* @returns The scraped inflection model {@link Table}.
*/
export function scrapeTable($: CheerioAPI, $body: Cheerio<Element>): Table {
const section = $body.children(Selectors.contentTabs.inflection.entry.table.body.element);

// Certain words are listed in the inflection tab but do not show up with a table.
if (section.length === 0) {
Expand Down Expand Up @@ -130,7 +161,7 @@ function parseBody($: CheerioAPI, body: Cheerio<Element>): Body {
const rowSpan = Number(column.attr("rowspan") ?? "1");
const columnSpan = Number(column.attr("colspan") ?? "1");

const text = getCellText($, column);
const text = scrapeCellContents($, column);

for (const _ of Array(columnSpan).keys()) {
const index = freeCells.shift() ?? extendedRows.length;
Expand All @@ -151,14 +182,22 @@ function parseBody($: CheerioAPI, body: Cheerio<Element>): Body {
return { table };
}

function getCellText($: CheerioAPI, cell: Cheerio<Element>): string {
const listElements = cell
/**
* Given a {@link $|Cheerio document handle} and the {@link $cell} inside the table for an inflection model, scrapes the
* contents of the cell.
*
* @param $ - A Cheerio document handle for the webpage.
* @param $cell - A Cheerio document handle for the cell element.
* @returns The scraped contents of the cell.
*/
export function scrapeCellContents($: CheerioAPI, $cell: Cheerio<Element>): string {
const listElements = $cell
.children("ul")
.children("li")
.map((_, element) => $(element));

if (listElements.length === 0) {
return cell.text().trim().replaceAll(/ +/g, " ");
return $cell.text().trim().replaceAll(/ +/g, " ");
}

const parts: string[] = [];
Expand Down
Loading

0 comments on commit 646754d

Please sign in to comment.