-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathinit_query.sql
133 lines (133 loc) · 5.08 KB
/
init_query.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
SELECT
phone.cldf_id AS ID,
phone.cldf_name AS Value,
1000*phone.duration AS Duration,
word.cldf_languageReference AS Language,
language.family AS Family,
word.speaker_id AS Speaker,
CASE
WHEN phone.cldf_id in (select cldf_id FROM utterance_initials) THEN 1 ELSE 0
END utt_initial, -- whether or not the phone is in utterance initial position
CASE
WHEN phone.cldf_id in (select cldf_id FROM word_initials) THEN 1 ELSE 0
END word_initial, -- whether or not the phone is in word initial position
sound.cldf_cltsReference AS CLTS,
-- normalized word length:
ROUND(((phones_per_word.num_phones - sd_num_phones.avg_num_phones) / sd_num_phones.num_phones), 3) AS z_num_phones,
-- normalized speech rate of the utterance:
ROUND(((utt.log_speech_rate - sd_speech_rate.avg_speech_rate) / sd_speech_rate.speech_rate), 3) AS z_speech_rate,
-- normalized frequency of the word form:
ROUND(((forms.freq - sd_word_freq.avg_word_freq) / sd_word_freq.word_freq), 3) AS z_word_freq,
cluster.cluster_status
FROM
"phones.csv" AS phone,
"words.csv" AS word, -- word-level metadata joined ON phone.wd_id = word.cldf_id
ParameterTable AS sound, -- sound-level metadata joined ON phone.cldf_parameterReference = sound.cldf_id
LanguageTable AS language
LEFT JOIN
(
SELECT
-- Here we compute the threshold for exclusion of unusually long phones.
w.speaker_id, avg(p.duration) + 3 * stdev(p.duration) AS threshold
FROM
`phones.csv` AS p,
`words.csv` AS w
WHERE
p.cldf_parameterreference IS NOT NULL AND
p.wd_id = w.cldf_id
GROUP BY w.speaker_id -- Thresholds are computed per speaker.
) AS t
ON
word.speaker_id = t.speaker_id
LEFT JOIN
phones_per_word
ON
phone.wd_id = phones_per_word.wd_id
LEFT JOIN
forms
ON
word.cldf_name = forms.form AND
word.cldf_languageReference = forms.cldf_languageReference
LEFT JOIN
utterances AS utt -- utterance-level stats such as speech rate.
ON
phone.u_ID = utt.u_id
LEFT JOIN -- summary stats on word length per language
(
SELECT
stdev(p.num_phones) AS num_phones,
AVG(p.num_phones) AS avg_num_phones,
w.cldf_languageReference
FROM
phones_per_word as p
LEFT JOIN
'words.csv' AS w
ON
p.wd_id = w.cldf_id
GROUP BY
w.cldf_languageReference
) AS sd_num_phones
ON word.cldf_languageReference = sd_num_phones.cldf_languageReference
LEFT JOIN -- summary stats on speech rate per language
(
SELECT
stdev(log_speech_rate) AS speech_rate,
AVG(log_speech_rate) AS avg_speech_rate,
cldf_languageReference
FROM
utterances
GROUP BY
cldf_languageReference
) AS sd_speech_rate
ON word.cldf_languageReference = sd_speech_rate.cldf_languageReference
LEFT JOIN -- summary stats on word form frequency per language
(
SELECT
stdev(freq) AS word_freq,
AVG(freq) AS avg_word_freq,
cldf_languageReference
FROM
forms
GROUP BY
cldf_languageReference
) AS sd_word_freq
ON
word.cldf_languageReference = sd_word_freq.cldf_languageReference
LEFT JOIN
(
SELECT
pp.cldf_id,
pp.wd_id,
pp.cldf_name,
CASE
WHEN not previous_is_consonant and instr(cldf_cltsreference, 'consonant') > 0 and next_is_consonant and next_wd_id = wd_id THEN 'clusterInitial'
WHEN ((previous_is_consonant and previous_wd_id = wd_id) or (next_is_consonant and next_wd_id = wd_id)) and instr(cldf_cltsreference, 'consonant') > 0 THEN 'noInitial'
ELSE 'noCluster'
END cluster_status
FROM (
SELECT
p.*,
s.cldf_cltsreference,
lag(instr(s.cldf_cltsreference, 'consonant') > 0) over () as previous_is_consonant,
lag(p.wd_id) over () as previous_wd_id,
lead(instr(s.cldf_cltsreference, 'consonant') > 0) over () as next_is_consonant,
lead(p.wd_id) over () as next_wd_id
FROM `phones.csv` as p
LEFT OUTER JOIN `parametertable` as s on p.cldf_parameterReference = s.cldf_id) as pp
) AS cluster
ON
phone.cldf_id = cluster.cldf_id
WHERE
phone.wd_id = word.cldf_id AND
phone.cldf_parameterReference = sound.cldf_id AND
word.cldf_languageReference = language.cldf_id AND
-- We only consider non-long, pulmonic consonants ...
sound.cldf_cltsReference LIKE '%_consonant' AND
sound.cldf_cltsReference NOT LIKE '%long%' AND
-- ... and exclude utterance-initial stops.
NOT (phone.cldf_id in (select cldf_id from utterance_initials) AND sound.cldf_cltsReference LIKE '%stop%') AND
NOT (phone.cldf_id in (select cldf_id from utterance_initials) AND sound.cldf_cltsReference LIKE '%affricate%') AND
-- We also exclude phonemes with unusually long durations, which hint at annotation errors.
phone.duration < t.threshold AND
phone.duration > 0.03
;