-
Notifications
You must be signed in to change notification settings - Fork 2
/
aozora-parser.pegjs
356 lines (270 loc) · 7.94 KB
/
aozora-parser.pegjs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
/*
* 青空文庫注記の解析表現文法
*
* https://github.com/kawabata/aozora-proc/blob/master/grammar.txt
* https://gist.github.com/takahashim/5b049a305128dcd12245
*/
/*
* 青空文庫の「文字」の表記法
*/
// Entry rule: a document is a sequence of one or more blocks.
Start
  = Block+

// A run of one or more characters.
String
  = Char+

// A single character. The first alternative takes any one character that is
// not a line feed and does not begin annotation, gaiji, Latin-bracket, ruby or
// ruby-base-marker syntax; the remaining alternatives admit the bracketed
// constructs that still count as one character.
Char
  = $( !"\n" !"[#" !"※[#" !("〔" LatinChar) !"《" !"》" !"|" . )
  / KanjiGaiji
  / NonKanjiGaiji
  / Kanbun
// One kanji: a code point inside either listed range, one of four kanji-like
// symbols, or a gaiji annotation that stands in for a kanji.
Kanji "漢字"
  = [\u3400-\u9FCB\uF900-\uFAD9]
  / [仝〆○々]
  / KanjiGaiji
// One kana: a character from the hiragana/katakana ranges below (which also
// cover the voicing and iteration marks), or a two-cell repeat mark spelled
// as slash + backslash, optionally with "″" between them for the voiced form.
//
// The backslash has to be written as "\\" inside a string literal; a single
// "\" followed by the closing quote escapes the quote and leaves the literal
// unterminated, which makes the whole grammar fail to compile.
//
// The full-width spellings are accepted as well.
// NOTE(review): Aozora Bunko texts are understood to write these marks with
// full-width "/" and "\" - confirm against the annotation manual.
Kana "かな"
  = [ぁ-んァ-ヶ゛-ゞ・-ヾ]
  / "/″\\"
  / "/\\"
  / "/″\"
  / "/\"
// Gaiji (external character) annotation that stands for a kanji.
// TODO (carried over from the original note): the annotation string still has
// to be parsed to extract JIS level 3/4 kanji or UCS information.
KanjiGaiji "漢字外字"
  = "※[#二の字点、1-2-22]"
  / "※[#「" AnnString "]"
  / "※[#二の字点、" AnnString "]"
  / "※[#濁点付き二の字点、" AnnString "]"

// Gaiji annotation whose body does not start with "「".
// TODO (carried over from the original note): same extraction work as above
// is still required here.
NonKanjiGaiji "非漢字外字"
  = "※[#" !"「" AnnString "]"
// Kanbun reading aid: an optional okurigana annotation followed by a return
// mark, or an okurigana annotation on its own.
Kanbun "漢文"
  = KuntenOkuri? Kaeriten
  / KuntenOkuri

// Okurigana annotation: one or more kanji/kana wrapped in "[#(" ... ")]".
KuntenOkuri "訓点送り"
  = "[#(" (Kanji / Kana)+ ")]"

// Return mark: an order mark optionally followed by "レ", or "レ" alone,
// wrapped in "[#" ... "]".
Kaeriten "返り点"
  = "[#" (KaeriJunjoen KaeriReten? / KaeriReten) "]"

// Exactly one order-mark character.
KaeriJunjoen "返り順序点"
  = [一二三四上中下天地人甲乙丙丁]

// The "レ" return mark.
KaeriReten "返りレ点"
  = "レ"
// One letter from the ASCII Latin, Greek or Cyrillic ranges listed below.
LatinChar "欧文字"
  = [a-zA-Zα-ρσ-ωΑ-ΡΣ-ΩА-яЁё]
/*
* 青空文庫の文に対する「注記」の表記法
*/
// Body text: one or more segments. A segment is either
//   - a plain or Latin string, with optional quote annotations on both sides
//     of an optional ruby, or
//   - a ruby with an explicit base marker, followed by quote annotations.
GeneralString "一般文字列"
  = (
      ((String / LatinString) QuoteAnn* GeneralRuby? QuoteAnn*)
    / (DefRuby QuoteAnn*)
    )+

// Text inside an annotation: characters up to, but not including, "]".
AnnString "注記文字列"
  = $(!"]" Char)+

// Quoted text inside an annotation. Same segment shape as GeneralString, but
// built from QuoteChar so that it stops before the closing quote sequences.
QuoteString "引用文字列"
  = $(
      (QuoteChar+ / LatinString) QuoteAnn* GeneralRuby? QuoteAnn*
    / DefRuby QuoteAnn*
    )+

// One character that does not start any of the sequences "」は", "」の",
// "」に" or "」]".
QuoteChar "引用文字"
  = $(!"」は" !"」の" !"」に" !"」]" Char)
// Annotation that quotes preceding body text; alternatives are tried in order.
QuoteAnn "引用注記"
  = ModifierAnn
  / OriginalAnn
  / TypistAnn

// Annotation that quotes preceding ruby text; alternatives are tried in order.
RubyAnn "ルビ注記"
  = RubyModifierAnn
  / RubyOriginalAnn
  / RubyTypistAnn
// "[#「...」" + Modifier + "]": applies a modifier to the quoted body text.
ModifierAnn "修飾注記"
  = "[#「" QuoteString "」" Modifier "]"

// Same shape as ModifierAnn, with the "ルビの" prefix before the quote.
RubyModifierAnn "ルビ修飾注記"
  = "[#ルビの「" QuoteString "」" Modifier "]"

// "[#「A」に「B」の注記]", with an optional "の左" after the first quote.
OriginalAnn "原文注記"
  = "[#「" QuoteString "」" "の左"? "に「" QuoteString "」の注記]"

// Same shape as OriginalAnn, with the "ルビの" prefix before the first quote.
RubyOriginalAnn "ルビ原文注記"
  = "[#ルビの「" QuoteString "」" "の左"? "に「" QuoteString "」の注記]"

// "[#「...」は" + TeihonAnn + "]".
TypistAnn "入力者注記"
  = "[#「" QuoteString "」は" TeihonAnn "]"

// Same shape as TypistAnn, with the "ルビの" prefix before the quote.
RubyTypistAnn "ルビ入力者注記"
  = "[#ルビの「" QuoteString "」は" TeihonAnn "]"

// Tail of a typist annotation: either "底本では「...」" or the literal "ママ".
TeihonAnn "底本注記"
  = "底本では「" QuoteString "」"
  / "ママ"
// What a ModifierAnn applies to its quoted text. Each alternative starts with
// a particle ("に", "の" or "は") followed by the kind of modification.
Modifier "修飾指定"
  = "に" Em
  / "の" LeftEm
  / "の" LeftRuby
  / "は" Jitai
  / "は" CharSize
// Emphasis mark name. The first alternative is one of four line styles with an
// optional "二重" prefix; the rest are fixed emphasis-dot names.
Em "強調"
  = "二重"? ("傍線" / "波線" / "破線" / "鎖線")
  / "傍点"
  / "白ゴマ傍点"
  / "丸傍点"
  / "白丸傍点"
  / "×傍点"
  / "黒三角傍点"
  / "白三角傍点"
  / "二重丸傍点"
  / "蛇の目傍点"
  / "白四角傍点"

// "左に" followed by an emphasis mark name.
LeftEm "左強調"
  = "左に" Em

// "左に「...」のルビ": the ruby text is carried as a quoted string.
LeftRuby "左ルビ"
  = "左に「" QuoteString "」のルビ"
// Typeface / layout style name: a heading, a ruled frame, or one of the fixed
// keywords below. Heading and KeiKakomi are tried before the keywords.
Jitai "字体"
  = Heading
  / KeiKakomi
  / "太字"
  / "斜体"
  / "分数"
  / "上付き小文字"
  / "下付き小文字"
  / "篆書体"
  / "小書き"
  / "行右小書き"
  / "行左小書き"
  / "横組み"
  / "縦中横"
  / "合字"
  / "ローマ数字"

// Heading: optional "窓" or "同行", then "大", "中" or "小", then "見出し".
Heading "見出し"
  = ("窓" / "同行")? ("大" / "中" / "小") "見出し"

// Ruled frame, with an optional "二重" prefix.
KeiKakomi "罫囲み"
  = "二重"? "罫囲み"
// Character size: a number, "段階", then "大きな文字" or "小さな文字".
// The inner grouping is kept so the shape of the match result is unchanged.
CharSize "文字サイズ"
  = Number "段階" (("大きな" / "小さな") "文字")
// A count used inside annotations: a run of ASCII digits, a run of full-width
// digits, or a single kanji numeral.
//
// The second alternative used to repeat the ASCII class verbatim, so it could
// never match anything the first had not already taken; it now covers the
// full-width digits U+FF10..U+FF19.
// NOTE(review): Aozora Bunko annotations are understood to write counts with
// full-width digits (e.g. "3字下げ") - confirm against the annotation manual.
Number "数"
  = [0-9]+
  / [0-9]+
  / [一二三四五六七八九十]
// Ruby attached to the preceding text, followed by any ruby annotations.
GeneralRuby "一般ルビ"
  = GeneralRuby2 RubyAnn*

// Ruby with an explicitly marked base, followed by any ruby annotations.
DefRuby "指定ルビ"
  = DefRuby2 RubyAnn*

// The ruby text itself, between "《" and "》".
GeneralRuby2 "一般ルビ2"
  = "《" $String "》"

// "|" + base text + optional quote annotations + "《" ruby text "》".
DefRuby2 "指定ルビ2"
  = "|" (String / LatinString) QuoteAnn* "《" String "》"
// Western-script text between "〔" and "〕". It must start with a Latin
// letter and contain at least one further item: a quote annotation, a Latin
// letter, or a printable ASCII character from "!" to "~".
//
// QuoteAnn is tried first on purpose. Every QuoteAnn starts with "[#", and both
// "[" and "#" fall inside [!-~]; with the class listed earlier, ordered choice
// consumed those two characters one at a time and QuoteAnn was never reached.
// Input without an embedded annotation is parsed exactly as before.
LatinString "欧文"
  = "〔" LatinChar (QuoteAnn / LatinChar / [!-~])+ "〕"
/*
* 青空文庫の「行」に対する注記法
*/
// A line: one or more general annotations and/or stretches of body text.
Line "行"
  = (GeneralAnn / GeneralString)+

// Annotations that can appear inside a line; alternatives are tried in order.
GeneralAnn "一般注記"
  = KakomiAnn
  / Warichu
  / ChiyoseAnn
  / Figure
  / TeihonTypistAnn
  // Original note on this list: "地上げ→地寄せ".
  // NOTE(review): presumably a terminology change relating to ChiyoseAnn - confirm.
// Enclosing annotation: an opening tag, body text, and a closing tag ending in
// "終わり]". The opening and closing names are matched independently; this
// rule does not check that they name the same style.
KakomiAnn "囲み注記"
  = "[#" ((Em / LeftEm / Jitai) / CharSize) "]"
    GeneralString
    "[#" (Em / LeftEm / Jitai / CharSizeEnd) "終わり]"

// Inline split note: "[#割り注]" ... "[#割り注終わり]", containing body text
// and/or explicit line-break annotations.
Warichu "割り注"
  = "[#割り注]"
    (Newline / GeneralString)+
    "[#割り注終わり]"

// Explicit line-break annotation.
Newline "改行"
  = "[#改行]"

// Size name used in the closing tag of a character-size enclosure.
CharSizeEnd "文字サイズ終"
  = "大きな文字"
  / "小さな文字"
// Bottom-alignment annotations for a line; alternatives are tried in order.
ChiyoseAnn "地上げ注記"
  = Chiyori
  / Chitsuki
  / Chiyose

// "[#下げて、地より" N "字あきで]"
Chiyori "地寄り"
  = "[#下げて、地より" Number "字あきで]"

// "[#地から" N "字上げ]"
Chiyose "地上げ"
  = "[#地から" Number "字上げ]"

// "[#地付き]"
Chitsuki "地付き"
  = "[#地付き]"
// Figure annotation: "[#" caption "(" file ".png" optional-size ")" "入る]".
//
// Both parenthesis groups used to list the same ASCII literal twice, leaving
// the second alternative dead. The duplicate is now the full-width form, so
// either spelling is accepted for the opening and closing parenthesis.
// NOTE(review): Aozora Bunko figure annotations are understood to use
// full-width parentheses - confirm against the annotation manual.
Figure "図"
  = "[#" FigureAnn ("(" / "(") FileName ".png" FigureSize? (")" / ")") "入る]"

// Figure caption: characters up to the opening parenthesis. It has to stop at
// both spellings that Figure accepts, otherwise the caption would swallow the
// delimiter and Figure could never match.
FigureAnn "図注記"
  = (!"(" !"(" Char)+
// File name stem: characters up to, but not including, ".png".
FileName "ファイル名"
  = (!".png" Char)+

// Optional figure dimensions: "、横" N "×縦" N.
FigureSize "図大きさ"
  = "、横" Number "×縦" Number

// Free-form note beginning "[#底本では" and running to the closing "]".
TeihonTypistAnn "底本入力者注記"
  = "[#底本では" AnnString "]"
/*
* 青空文庫の段落ブロックに対する注記法
*/
// Top-level block: any block kind, optionally followed by "[#本文終わり]".
Block "ブロック"
  = (
      PageDef
    / ParaIndent
    / ParaDef
    / Para
    ) "[#本文終わり]"?

// Block kinds allowed inside an indented region (ParaIndent2): everything
// except a nested ParaIndent.
Block2 "ブロック2"
  = PageDef
  / ParaDef
  / Para

// Block kinds allowed inside a centered page (Centering): everything except
// PageDef.
Block3 "ブロック3"
  = ParaIndent
  / ParaDef
  / Para
// A paragraph: optional indent annotation, optional line content, and a
// mandatory terminating line feed. An empty line is therefore a valid Para.
Para "段落"
  = Indent? Line? "\n"

// Single-paragraph indent: "[#" N "字下げ]".
Indent "字下げ"
  = "[#" Number "字下げ]"
// Page-level directive: a centered page or a page/leaf break.
PageDef "ページ指定"
  = Centering
  / ClearAnn

// Centered page: opening tag on its own line, any number of Block3 items,
// then "[#改ページ]" on its own line.
Centering "左右中央"
  = "[#ページの左右中央]" "\n" Block3* "[#改ページ]" "\n"

// "[#改丁]" or "[#改ページ]" followed by a line feed.
ClearAnn "改まり注記"
  = ("[#改丁]" / "[#改ページ]") "\n"
// Paragraph-level directive: one of the region or break forms below, followed
// by a line feed. Alternatives are tried in the order listed.
ParaDef "段落指定"
  = (
      ParaJizume
    / ParaChitsuki
    / ParaNegativeIndent
    / ClearColumn
    / ParaJitai
    / ParaLargeChar
    / ParaSmallChar
    / Column
    ) "\n"

// Column break annotation.
ClearColumn "改段"
  = "[#改段]"
// Indented region: one or more opening-tag sections, then the closing tag and
// a line feed.
ParaIndent "段落字下げ"
  = ParaIndent2+ IndentEnd "\n"

// One section of an indented region: an opening tag on its own line followed
// by any number of Block2 items.
ParaIndent2 "段落字下げ2"
  = (
      NewlineTentsuki
    / LeftIndent
    / IndentBegin
    / IndentBegin2
    ) "\n" Block2*

// "[#ここから改行天付き、折り返して" N "字下げ]"
NewlineTentsuki "改行天付き"
  = "[#ここから改行天付き、折り返して" Number "字下げ]"

// "[#天から" N "字下げ]"
LeftIndent "天字下げ"
  = "[#天から" Number "字下げ]"

// "[#ここから" N "字下げ]"
IndentBegin "文字下げ"
  = "[#ここから" Number "字下げ]"

// "[#ここから" N "字下げ、折り返して" N "字下げ]"
IndentBegin2 "文字下げ2"
  = "[#ここから" Number "字下げ、折り返して" Number "字下げ]"

// Closing tag of an indented region.
IndentEnd "字下げ終"
  = "[#ここで字下げ終わり]"
// Each rule below is a bracketed region: an opening tag ending in a line feed,
// any number of Blocks, and a closing tag. The line feed after the closing tag
// is consumed by ParaDef.

// Region opened by "[#ここから" N "字詰め]".
ParaJizume "段落字詰め"
  = "[#ここから" Number "字詰め]\n" Block* "[#ここで字詰め終わり]"

// Region opened by "[#ここから地付き]".
ParaChitsuki "段落地付き"
  = "[#ここから地付き]\n" Block* "[#ここで地付き終わり]"

// Region opened by "[#ここから地から" N "字上げ]".
ParaNegativeIndent "段落字上げ"
  = "[#ここから地から" Number "字上げ]\n" Block* "[#ここで字上げ終わり]"

// Region in a given typeface. The opening and closing Jitai are matched
// independently; this rule does not check that they are the same.
ParaJitai "段落字体"
  = "[#ここから" Jitai "]\n" Block* "[#ここで" Jitai "終わり]"

// Region opened by "[#ここから" N "段階大きな文字]".
ParaLargeChar "段落文字大"
  = "[#ここから" Number "段階大きな文字]\n" Block* "[#ここで大きな文字終わり]"

// Region opened by "[#ここから" N "段階小さな文字]".
ParaSmallChar "段落文字小"
  = "[#ここから" Number "段階小さな文字]\n" Block* "[#ここで小さな文字終わり]"

// Region opened by "[#ここから" N "段組み", with an optional "、段間に罫"
// before the closing "]".
Column "段組み"
  = "[#ここから" Number "段組み" "、段間に罫"? "]\n" Block* "[#ここで段組み終わり]"
/*
* 制約条件
*
* 「囲み注記」 の開始・終了注記の字体修飾指定は一致しなければなりません。
* 「修飾注記・原文注記等・原文注記・入力者注記・底本注記・左ルビ」における「引用文字列」は、
* その直前の本文文字列と、注記表記の有無のいずれかで一致しなければなりません。
*/