-
Notifications
You must be signed in to change notification settings - Fork 0
/
00_create_db.sql
121 lines (112 loc) · 4.99 KB
/
00_create_db.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
-- # Database for the WordSet project
-- ## Languages
-- Languages are identified by their ISO lang code, and they have separate
-- r-trees, so the id of their root node is here as well.
DROP TABLE IF EXISTS lang_t;
CREATE TABLE lang_t (
id INTEGER PRIMARY KEY,
isocode TEXT, -- ISO 639-2/T three-letter code, 'in own language' i.e. 'deu' instead of 'ger'
rootnode INTEGER -- REFERENCES node_t
);
-- ## Classes
-- No need to store it in DB, won't ever look it up dynamically, and it'll never change.
--
-- enum word_classes_t {
-- NOUN = 0, // eg. home, happiness, chocolate
-- VERB = 1, // eg. walk, happen, be
-- ADJECTIVE = 2, // eg. big, sweet, bright
-- ADVERB = 3, // eg. very, quite, quickly, today
-- PRONOUN = 4, // eg. your, we, there, who, myself
-- PREPOSITION = 5, // eg. in, of, to, with
-- CONJUNCTION = 6, // eg. and, but, so, therefore
-- INTERJECTION = 7, // eg. oh, alas, oops
-- ARTICLE = 8, // eg. a, an, the
-- }
-- ## Concepts
-- Concepts are the meaning behind the words. A word may describe different concepts,
-- (eg. 'gas' may mean the state of substance, or the propellant of cars), and a
-- concept may be described by different words (eg. 'petrol' and 'gas' may mean the
-- same thing), so there is an N-to-N relationship between words and concepts.
-- Moreover, it sometimes depends on the language, like that languages of sea-faring
-- cultures have more concepts about parts of ships than of those in-land cultures.
DROP TABLE IF EXISTS concept_t;
CREATE TABLE concept_t (
id INTEGER PRIMARY KEY,
class INTEGER -- enum word_classes_t
);
DROP INDEX IF EXISTS concept_class_i;
CREATE INDEX concept_class_i ON concept_t (class);
-- ## Tags
-- Tags are arbitrary labels that represent thematic categories (eg. 'plants', 'food',
-- 'religion', 'moving'), cultural aspects (eg. 'slang', 'archaic', 'official'),
-- dialects (eg. 'us', 'brazil'), or anything else :)
DROP TABLE IF EXISTS tag_t;
CREATE TABLE tag_t (
id INTEGER PRIMARY KEY,
name TEXT, -- in English, fallback
concept INTEGER -- REFERENCES concept_t -- for translations
);
-- ## Words
-- More correctly, lemmata: canonical forms of word stems, without any conjugation
-- that is not required for the meaning (eg. 'get on' instead of 'getting on', but
-- not just 'get'). Also it is language-specific, because 'Bad' means a different
-- concept in English than in German :)
DROP TABLE IF EXISTS word_t;
CREATE TABLE word_t (
id INTEGER PRIMARY KEY,
lang INTEGER, -- REFERENCES lang_t
value TEXT
);
DROP INDEX IF EXISTS word_lang_i;
CREATE INDEX word_lang_i ON word_t (lang);
DROP INDEX IF EXISTS word_value_i;
CREATE INDEX word_value_i ON word_t (value);
-- ## Nodes
-- For looking up words based on similarity, we need an R-tree (for each
-- language) with a metric being a modified Levenshtein distance.
-- Each row of this table represents a node in such an R-tree, and has a
-- type (leaf or internal node), a boundary expression that covers all its
-- content ('abc|ef|apq|...', because it's easier to parse than '[abc][ef][apq]...'),
-- the number of the mandatory prefix (the rest is optional), and the id-s
-- of the child nodes/words as a blob of little-endian uint32_t-s.
-- (No search is needed by child nodes, so it spares us a connecting table,
-- and this is a very frequent query, so its performance *does* count.)
--
-- enum node_type_t {
-- LEAF = 0, // contains id-s of words
-- INTERNAL = 1, // contains id-s of other nodes
-- }
DROP TABLE IF EXISTS node_t;
CREATE TABLE node_t (
id INTEGER PRIMARY KEY,
ntype INTEGER, -- enum node_type_t
boundary TEXT,
mandatory_pfx INTEGER,
children BLOB
);
-- ## Connection table: Words-to-concepts
DROP TABLE IF EXISTS word_concept_t;
CREATE TABLE word_concept_t (
word INTEGER, -- REFERENCES word_t
concept INTEGER -- REFERENCES concept_t
);
DROP INDEX IF EXISTS w_c_word_i;
CREATE INDEX w_c_word_i ON word_concept_t (word);
DROP INDEX IF EXISTS w_c_concept_i;
CREATE INDEX w_c_concept_i ON word_concept_t (concept);
-- ## Connection table: Tags-to-concepts
DROP TABLE IF EXISTS tag_concept_t;
CREATE TABLE tag_concept_t (
tag INTEGER, -- REFERENCES tag_t
concept INTEGER -- REFERENCES concept_t
);
DROP INDEX IF EXISTS t_c_word_i;
CREATE INDEX t_c_word_i ON tag_concept_t (tag);
DROP INDEX IF EXISTS t_c_concept_i;
CREATE INDEX t_c_concept_i ON tag_concept_t (concept);
-- ## TODO: relation of concepts, like
-- noun-to-noun: generalisation-of, specialisation-of (eg. 'rose' is spec. of 'flower' is spec. of 'plant' is spec. of 'object')
-- adjective-to-noun: abstraction-of, state-of (eg. 'silence' is abstr. of 'silent', 'silentness' is state of 'silent')
-- adjective-to-adjective: opposite-of, higher-degree-of (eg. 'cold' is opp. of 'hot', 'hot' is higher deg. of 'warm')
-- verb-to-noun: result-of (eg. 'meal' is result of 'cook')
-- etc.