-
Notifications
You must be signed in to change notification settings - Fork 0
/
ctb2stb.py
executable file
·940 lines (910 loc) · 50.7 KB
/
ctb2stb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
'''
Author: hzs
Date: 2021-12-16 08:38:33
LastEditors: hzs
LastEditTime: 2022-06-14 10:57:55
Description: 拷贝jbw_new_add_jbwtag.py 规则列表删除了att对应的ADJP 添加<jbw> <ju id=''>标签
添加了状语adv的其他规则
重新定义小句
'''
import argparse
from tkinter import N
#from tree import Tree
from xml.etree import ElementTree as etree
from xml.dom import minidom
from xml.etree.ElementTree import Element,SubElement,ElementTree
import re
class Node(object):
"""
构造节点
"""
def __init__(self,val=None,children=None,father = None,tag=None,rel=None):
self.val = val
self.children = []
self.father = father
self.tag = tag
# rel记录特殊标记 如同位语
self.rel = rel
class N_Tree(object):
def __init__(self):
"""初始化一个空的二叉树"""
self.root = None
self.node_dict = {}
def build(self,node_list):
final = node_list[-1]
num = final.count('| ')
tag = final.strip().rsplit('| ',1)[-1]
# 处理终端节点
if tag.endswith('_terminal'): #endswith()函数用于判断字符串是否以指定后缀结尾,是则返回true:str.endswith(suffix[, start[, end]])
tag = tag.rsplit('_',1)[0] #rsplit() 方法从右侧开始将字符串拆分为列表。
max_len = len(node_list)
if (tag,max_len) not in self.node_dict:
de = Node(tag)
else:
de = self.node_dict[(tag,max_len)]
if final.endswith('_terminal'):
de.children = None
node_list.pop()
node_list_len = len(node_list)
for index,node in enumerate(node_list[::-1]):
curr_len = node_list_len - index
curr_num = node.count('| ')
if curr_num == num - 1:
parent_tag = node.strip().rsplit('| ',1)[-1]
if (parent_tag,curr_len) not in self.node_dict:
parent_node = Node(parent_tag)
self.node_dict[(parent_tag,curr_len)] = parent_node
self.node_dict[(parent_tag,curr_len)].children.insert(0,de)
de.father = self.node_dict[(parent_tag,curr_len)]
break
if len(node_list) == 1 and node_list[0].startswith('TOP'):
return
else:
self.build(node_list)
# 层序遍历
def levelOrder(self,root):
res = []
cur_queue = []
cur_queue.append(root)
if not root:
return res
while len(cur_queue):
sub = []
for i in range(len(cur_queue)):
current = cur_queue.pop(0)
if current.val == 'NP-SBJ':
root = Element( 'xj' )
sbj = SubElement(root, 'sbj' )
sub.append(current.val)
if current.children != None:
for j in current.children:
cur_queue.append(j)
res.append(sub)
return res
def find_parent(self,node):
return node.father
# 寻找tag为1的父节点
def find_father(self,node):
node_father = node.father
while node_father.tag != 1:
node_father = node_father.father
return node_father
# 处理小句导致的乱序情况
def find_have_bro_node(self,node):
while node.val !='TOP' and node.father and node.father.children[-1] == node:
node = node.father
if node.val == 'TOP':
return node.children[0]
else:
return node.father
# 寻找最顶层的TOP节点
# 先序遍历
def preorder(self,root):
xml_dict = {}
if not root:
return []
q = [root]
node_res = []
sent_res = []
num = 0
while q:
# 弹出列表尾部的一个元素
node = q.pop()
num+=1
node_res.append(node)
if node.children == None:
word = node.val.rsplit('_',1)[1]
sent_res.append(word)
# 逆序加入,从右到左
if node.children != None:
for child in node.children[::-1]:
q.append(child)
return node_res,sent_res
# 读取转换规则
def read_rules(self,file_path):
rules_dict = {}
with open(file_path,'r',encoding = 'utf-8') as rf:
for line in rf:
line = line.strip()
line_list = line.split()
key = line_list[:-1]
value = line_list[-1]
for i in key:
rules_dict[i.strip()] = value
return rules_dict
# 读取词性规则
def read_pos(self,file_path):
pos_dict = {}
with open(file_path,'r',encoding = 'utf-8') as rf:
for line in rf:
line = line.strip()
line_list = line.split()
value = line_list[-1]
for i in line_list[:-1]:
pos_dict[i] = value
return pos_dict
def process_val(self,node):
val = node.val
# 找标点符号的邻居叶子节点
def find_left_word(self,node):
while node.children != None and node.children[-1] != None:
node = node.children[-1]
return node
def find_left_final_node(self,node):
while node.children != None and node.children[0] != None:
node = node.children[0]
return node
# 处理标点符号,让他接在相邻的单词的后面
def process_pu(self,node):
if len(node.father.children) > 0:
for index,cur_node in enumerate(node.father.children): #列出元素和下标
if cur_node == node:
left_index = index - 1
if left_index >= 0:
left_node = node.father.children[left_index]
return left_node
else:
return 1
# 规则转换
def rule_conversion(self,node_res,sent_res,ju_index):
rules_dict = self.read_rules('./dict/ctb_to_jbw_new3.dict')
pos_dict = self.read_pos('./dict/ctb_to_jbw_pos.dict')
xml_dict = {}
for node in node_res:
node.val = node.val.lstrip('(') #lstrip截掉字符串左边的空格或指定字符
# 解决标点、 xml右侧条件 cc_COO 标签
if node.rel == 'cc_coo':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cc = SubElement(father_xml, 'cc')
cc.set('fun','COO')
if node.rel == 'APP':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cc = SubElement(father_xml, 'cc' )
cc.set('fun','APP')
if node.rel == 'uni':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cc = SubElement(father_xml, 'cc')
cc.set('fun','UNI')
if node.rel == 'pvt':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cc = SubElement(father_xml, 'cc')
cc.set('fun','PVT')
#同位语
if node.val in rules_dict and rules_dict[node.val] == 'app':
for index,i in enumerate(node.father.children):
if i == node:
bro_index = index
# 同位右侧可能没有节点 为了测试先注释这一行
if bro_index+1 <= len(node.father.children) -1:
node.father.children[bro_index+1].rel = 'APP'
######################################################################################3
if node.rel == 'cmp':#解决补语 新建一个节点值为cmp 当前节点的父亲节点为该节点
# node.tag = 1
# 找到当前结点的父结点
finded_node = self.find_father(node)
#获取到父结点的xml:father_xml
father_xml = xml_dict[finded_node]
#在其父结点下通过SubElement()函数创建cmp结点
xml_new_tag = SubElement(father_xml, 'cmp')
#将创建的cmp结点的tag设置为1,以便后续查询
tmp_node = Node('cmp')
tmp_node.tag = 1
#把当前结点赋给node.father
node.father = tmp_node
#把新加的cmp结点的加入xml_dict字典
xml_dict[tmp_node] = xml_new_tag
# 设置其左孩子
left_node = None
left_node_word = None
left_node_pos = None
right_bro_nodes_list = []
# 找当前节点的左孩子节点
if node.father:
left_node = None
# 找到当前节点的索引
if len(node.father.children) > 1:
for index,no in enumerate(node.father.children):
tmp_index = index
if tmp_index -1 >= 0 and tmp_index -1 < len(node.father.children):
left_node = node.father.children[tmp_index - 1]
if not left_node.children:
left_node_word = left_node.val.strip().split('_',1)[1].strip()
left_node_pos = left_node.val.strip().split('_',1)[0].strip()
if tmp_index -2 >= 0:
left_left_node = node.father.children[tmp_index - 2]
right_bro_nodes_list = node.father.children[tmp_index:]
if left_node and not left_node.children and (left_node_word==',' or left_node_word==';' or left_node_word==':') and left_node_pos=='PU':
finded_node = self.find_father(left_left_node)
father_xml = xml_dict[finded_node]
# 将该节点以及右侧所有的兄弟节点的 父节点 改为新建的xj节点
if left_left_node in xml_dict:
if xml_dict[left_left_node].tag != 'sbj' and xml_dict[left_left_node].tag != 'adv' and xml_dict[left_left_node].tag != 'att':
ju = xml_dict['ju']
xj = SubElement(ju,'xj') #SubElement()函数有两个对象,父结点下一个新的结点
# 新建一个xj节点
new_node = Node('xj')
new_node_list = []
for i in right_bro_nodes_list:
new_node_list.append(i)
father = node.father
while father.father:
if father.father.children[-1] != father:
cur_index = father.father.children.index(father)
for i in father.father.children[cur_index+1:]:
new_node_list.append(i)
father = father.father
new_node.tag = 1
xml_dict[new_node] = xj
else:
if father_xml.tag != 'sbj' and father_xml.tag !='adv' and father_xml.tag != 'att':
ju = xml_dict['ju']
xj = SubElement(ju,'xj') #SubElement()函数有两个对象,父结点下一个新的结点
# 新建一个xj节点
new_node = Node('xj')
new_node_list = []
for i in right_bro_nodes_list:
new_node_list.append(i)
father = node.father
while father.father:
if father.father.children[-1] != father:
cur_index = father.father.children.index(father)
for i in father.father.children[cur_index+1:]:
new_node_list.append(i)
father = father.father
for i in new_node_list:
i.father = new_node
new_node.children = new_node_list
new_node.tag = 1
xml_dict[new_node] = xj
if node.val == 'TOP':
root = Element( 'jbw' )
ju = SubElement(root,'ju')
ju.set('txt',''.join(sent_res))
ju.set('id',ju_index)
# 存储ju
xml_dict['ju'] = ju
xj = SubElement(ju,'xj')
xml_dict[node] = xj
node.tag = 1
# 叶子节点
elif node.children == None:
#处理兼语 不管是不是NONE 都要最后再进行一次判断 添加兼语
if node.rel == 'pvt':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cc = SubElement(father_xml, 'cc')
cc.set('fun','PVT')
if node.val.strip().rsplit('_',1)[0]!='-NONE-':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
word = node.val.strip().split('_',1)[1].strip()
pos_tag = node.val.strip().split('_',1)[0].strip()
if pos_tag == 'VA':
for index,no in enumerate(node.father.father.children):
if no == node.father:
tmp_index = index
left_index = tmp_index - 1
left_node = node.father.father.children[left_index]
if left_node.children and left_node.children[0] and left_node.children[0].val.strip().rsplit('_',1)[0] != '-NONE-':
# 原来这句话是注释的 下面一行是打开的
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
obj = SubElement(father_xml, 'prd')
cv_pos = pos_dict[pos_tag]
pos = SubElement(obj, cv_pos)
pos.text = word
else:
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cv_pos = pos_dict[pos_tag]
pos = SubElement(father_xml, cv_pos)
pos.text = word
# 原来if改成elif
elif pos_tag == 'VC' or pos_tag == 'VE' or pos_tag == 'VV':
prd_tag = False
if node.father.val != 'VSB':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
obj = SubElement(father_xml, 'prd')
# 新建prd标签 让VV的父节点为prd
cv_pos = pos_dict[pos_tag]
pos = SubElement(obj, cv_pos)
pos.text = word
# 处理prd上的属性
if len(node.father.children) > 1:
for index,cur_node in enumerate(node.father.children):
if cur_node == node:
tmp_index = index
tmp_list = []
for i in node.father.children[tmp_index:]:
if i.val in rules_dict:
if rules_dict[i.val]=='obj' or rules_dict[i.val] =='cmp':
tmp_list.append(rules_dict[i.val])
if i.rel == 'cmp':
tmp_list.append('cmp')
total_rel = ''.join([j[0] for j in tmp_list])
if total_rel == 'o':
obj.set('scp','VO')
elif total_rel == 'oo':
obj.set('scp','VOO')
elif total_rel == 'c':
obj.set('scp','VC')
elif total_rel == 'co':
obj.set('scp','VCO')
elif total_rel == 'oc':
obj.set('scp','VOC')
else:
obj.set('scp','V')
if len(node.father.children) == 1:
obj.set('scp','V')
# 修改地方
prd_tag = True
else:
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cv_pos = pos_dict[pos_tag]
pos = SubElement(father_xml, cv_pos)
pos.text = word
# 趋向合成
if node.val.strip().split('_',1)[0].strip() == 'VV' or node.val.strip().split('_',1)[0].strip() == 'VC':
# SYN 修改规则 VV父结点是VP VV右兄弟结点是VP此VP的所有孩子结点中有VV
father_node = node.father
if node.father.val == 'VP' and len(node.father.children) == 2 and node.father.children[1].val == 'VP':
node_res,sent_res = self.preorder(node.father.children[1])
childs_val = [i.val if i.children else i.val.rsplit('_')[0] for i in node_res]
if 'VV' in childs_val:
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cc = SubElement(father_xml, 'cc')
cc.set('fun','SYN')
# 能愿
if node.val == 'VP':
if len(node.father.children) > 1:
for index,no in enumerate(node.father.children):
if no == node:
tmp_index = index
if node.father.val == 'VP':
# bro_val 是 叶子节点 且词性是VC
bro_node = node.father.children[tmp_index-1]
if not bro_node.children:
pos,word = bro_node.val.split('_')
if pos == 'VV' or pos == 'VC':
if len(node.children) >= 1:
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cc = SubElement(father_xml, 'cc')
cc.set('fun','SYN')
# 处理兼语句
if pos_tag == 'VE':
if len(node.father.children) > 1:
for index,cur_node in enumerate(node.father.children):
if cur_node == node:
tem_index = index
if (tem_index + 1) < len(node.father.children):
rig_bro_node = node.father.children[tem_index + 1]
if rig_bro_node.val in rules_dict and rules_dict[rig_bro_node.val] == 'obj':
if rig_bro_node.children[0].val in rules_dict and rules_dict[rig_bro_node.children[0].val] == 'sbj':
if len(rig_bro_node.children) > 1:
finded_node = self.find_father(rig_bro_node)
if rig_bro_node.children[1].children != None:
rig_bro_node.children[1].father = finded_node
rig_bro_node.children[1].rel = 'pvt'
else:
if len(rig_bro_node.children) > 2:
if rig_bro_node.children[2].children != None:
rig_bro_node.children[2].father = finded_node
rig_bro_node.children[2].rel = 'pvt'
if pos_tag == 'VV':
if len(node.father.children) > 2:
for index,cur_node in enumerate(node.father.children):
if cur_node == node:
tem_index = index
if (tem_index + 1) < len(node.father.children) and (tem_index + 2) < len(node.father.children):
first_rig_bro_node = node.father.children[tem_index + 1]
sec_rig_bro_node = node.father.children[tem_index + 2]
if first_rig_bro_node.val in rules_dict and rules_dict[first_rig_bro_node.val] == 'obj':
if sec_rig_bro_node.children != None:
child = sec_rig_bro_node.children[0]
if child.val in rules_dict and rules_dict[child.val] == 'sbj':
sec_rig_bro_node.rel = 'pvt'
if prd_tag:
new_prd = Node('prd')
node.father = new_prd
xml_dict[new_prd] = obj
new_prd.tag = 1
elif pos_tag == 'P':#介词位
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
pp = SubElement(father_xml, 'pp')
cv_pos = pos_dict[pos_tag]
pos = SubElement(pp, cv_pos)
pos.text = word
elif pos_tag == 'BA' or pos_tag == 'LB':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
if len(node.father.children) > 1:
for i in node.father.children:
if i != node:
bro_node = i
if re.search(r'.P-OBJ',bro_node.val) or re.search(r'.P-.P-OBJ',bro_node.val):
if bro_node.children[0].val in rules_dict:
if rules_dict[bro_node.children[0].val] == 'sbj':
adv = SubElement(father_xml, 'adv')
pp = SubElement(adv, 'pp')
cv_pos = pos_dict[pos_tag]
pos = SubElement(pp, cv_pos)
pos.text = word
tmp_node = Node('adv')
tmp_node.tag = 1
bro_node.children[0].father = tmp_node
xml_dict[tmp_node] = adv
else:
pp = SubElement(father_xml, 'pp')
cv_pos = pos_dict[pos_tag]
pos = SubElement(pp, cv_pos)
pos.text = word
# 对于不符合条件的LB之前会漏掉
else:
pp = SubElement(father_xml, 'pp')
cv_pos = pos_dict[pos_tag]
pos = SubElement(pp, cv_pos)
pos.text = word
else:
pp = SubElement(father_xml, 'pp')
cv_pos = pos_dict[pos_tag]
pos = SubElement(pp, cv_pos)
pos.text = word
elif pos_tag == 'LC':#方位词
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
ff = SubElement(father_xml, 'ff')
# 加入xml_dict 为了标点符号考虑
xml_dict[node] = ff
cv_pos = pos_dict[pos_tag]
pos = SubElement(ff, cv_pos)
pos.text = word
# 助位词
elif pos_tag == 'DEG' or pos_tag == 'DER' or pos_tag == 'DEV' or pos_tag == 'DEC':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
uu = SubElement(father_xml, 'uu')
# 加入xml_dict 为了标点符号考虑
xml_dict[node] = uu
cv_pos = pos_dict[pos_tag]
pos = SubElement(uu, cv_pos)
pos.text = word
# DER右侧节点为补语 之前代码少了一层if判断
if pos_tag == 'DER':
for index,i in enumerate(node.father.children):
if i == node:
if index < len(node.father.children)-1:
node.father.children[index+1].rel = 'cmp'
break
elif pos_tag == 'ETC':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
uu = SubElement(father_xml, 'un')
# 加入xml_dict 为了标点符号考虑
xml_dict[node] = uu
cv_pos = pos_dict[pos_tag]
pos = SubElement(uu, cv_pos)
pos.text = word
elif pos_tag == 'SP' or pos_tag == 'AS' or pos_tag == 'SB' or pos_tag == 'MSP':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
uu = SubElement(father_xml, 'uv')
# 加入xml_dict 为了标点符号考虑
xml_dict[node] = uu
cv_pos = pos_dict[pos_tag]
pos = SubElement(uu, cv_pos)
pos.text = word
# 连词位
# elif pos_tag == 'CC' or pos_tag == 'CS':
elif pos_tag == 'CS' or pos_tag == 'CC' or pos_tag == 'ADVP-PRP':
if pos_tag == 'CC':
childs_val = [childs.val if childs.children else childs.val.split('_')[0] for childs in node.father.children]
cur_index = node.father.children.index(node)
if cur_index + 1 < len(node.father.children) and cur_index - 1 > -2:
right_node = node.father.children[cur_index+1]
left_node = node.father.children[cur_index-1]
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
if cur_index == 0:
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cv_pos = pos_dict[pos_tag]
cc = SubElement(father_xml, 'cc')
pos = SubElement(cc, cv_pos)
pos.text = word
else:
if node.father.val == 'VP' and left_node.val == 'VP' and right_node.val == 'VP':
# right_node.rel = 'uni'
cc = SubElement(father_xml, 'cc')
cc.set('fun','UNI')
elif node.father.val == 'IP' and left_node.val == 'IP' and right_node.val == 'IP':
# right_node.rel = 'uni'
cc = SubElement(father_xml, 'cc')
cc.set('fun','UNI')
else:
# right_node.rel = 'cc_coo'
cc = SubElement(father_xml, 'cc')
cc.set('fun','COO')
cv_pos = pos_dict[pos_tag]
pos = SubElement(cc, cv_pos)
pos.text = word
else:
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cv_pos = pos_dict[pos_tag]
pos = SubElement(father_xml, cv_pos)
pos.text = word
elif pos_tag == 'CS' or pos_tag == 'ADVP-PRP':
cc = SubElement(father_xml, 'cc')
cv_pos = pos_dict[pos_tag]
pos = SubElement(cc, cv_pos)
pos.text = word
else:
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cv_pos = pos_dict[pos_tag]
pos = SubElement(father_xml, cv_pos)
pos.text = word
# 独立语
elif pos_tag == 'IJ' or pos_tag == 'PRN' or pos_tag == 'NP-PN-SBJ-VOC' or pos_tag == 'INTJ-IJ':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
uu = SubElement(father_xml, 'ind')
# 加入xml_dict 为了标点符号考虑
xml_dict[node] = uu
cv_pos = pos_dict[pos_tag]
pos = SubElement(uu, cv_pos)
pos.text = word
elif pos_tag == 'PU':
left_node = self.process_pu(node)
if left_node != 1:
# 添加PU、 兄弟节点有CC 则在标点后加 空 cc_COO
if word == '、':
childs_val = [childs.val if childs.children else childs.val.split('_')[0] for childs in node.father.children]
cur_index = node.father.children.index(node)
if cur_index + 1 < len(node.father.children) and cur_index - 1 > -1:
right_node = node.father.children[cur_index+1]
left_node = node.father.children[cur_index-1]
if 'CC' in childs_val:
right_node.rel = 'cc_coo'
if node.father.val == 'VP' and left_node.val == 'VP' and right_node.val == 'VP':
right_node.rel = 'uni'
if node.father.val == 'IP' and left_node.val == 'IP' and right_node.val == 'IP':
right_node.rel = 'uni'
adjoin_node = self.find_left_word(left_node)
if adjoin_node in xml_dict:
father_xml = xml_dict[adjoin_node]
else:
finded_node = self.find_father(adjoin_node)
father_xml = xml_dict[finded_node]
cv_pos = pos_dict[pos_tag]
pos = SubElement(father_xml, cv_pos)
pos.text = word
else:
cv_pos = pos_dict[pos_tag]
pos = SubElement(father_xml, cv_pos)
pos.text = word
else:
if pos_tag== '-NONE-':
pass
else:
cv_pos = pos_dict[pos_tag]
pos = SubElement(father_xml, cv_pos)
pos.text = word
# 如果叶子节点为-NONE-
else:
if node.father.val in rules_dict and (rules_dict[node.father.val] == 'sbj' or rules_dict[node.father.val] == 'obj'):
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
xml_new_tag = SubElement(father_xml, 'x')
xml_dict[node] = xml_new_tag
# 如果节点的值不在规则列表中
elif node.val not in rules_dict:
if node.val == 'VSB' or node.val == 'VRD':
node.tag = 1
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
xml_new_tag = SubElement(father_xml, 'prd')
xml_dict[node] = xml_new_tag
# pass
if node.val == 'QP' or node.val == 'DP':
if node.father.val.startswith('NP-'):
node.tag = 1
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
xml_new_tag = SubElement(father_xml, 'att')
xml_dict[node] = xml_new_tag
# 定语规则 右边是NP-SBJ或ADVP或NP-TMP的NP-LOC-PN
if node.val == 'NP-LOC-PN':
# 找到当前节点的索引
if len(node.father.children) > 1:
for index,no in enumerate(node.father.children):
if no == node:
tmp_index = index
for i in node.father.children[tmp_index:]:
if i.val == 'NP-SBJ' or i.val == 'ADVP' or i.val == 'NP-TMP':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cmp_tmp = SubElement(father_xml, 'att')
xml_dict[node] = cmp_tmp
node.tag = 1
# 右边有兄弟节点的CP
if node.val == 'CP':
# 找到当前节点的索引
if len(node.father.children) > 1:
for index,no in enumerate(node.father.children):
if no == node:
tmp_index = index
if tmp_index +1 <= len(node.father.children) - 1:
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cmp_tmp = SubElement(father_xml, 'att')
xml_dict[node] = cmp_tmp
node.tag = 1
# 主语 右边是VP的NP-LOC-PN
if node.val == 'NP-LOC-PN':
# 取到当前结点的索引
if len(node.father.children) > 1:
for index,no in enumerate(node.father.children):
if no == node:
tmp_index = index
for i in node.father.children[tmp_index:]:
if i.val == 'VP':
#找到当前结点的父结点
fined_node = self.find_father(node)
#取到父结点的xml
father_xml = xml_dict[fined_node]
#在父结点下创建obj结点
cmp_tmp = SubElement(father_xml, 'sbj')
#把新加的结点放进xml字典中:
xml_dict[node] = cmp_tmp
node.tag = 1
# 主语 左边无VV兄弟结点的LCP-LOC
if node.val == 'LCP-LOC':
for index,no in enumerate(node.father.children):
if no == node:
tmp_index = index
for i in node.father.children[:tmp_index]:
if i.val != 'VV':
fined_node = self.find_father(node)
father_xml = xml_dict[fined_node]
cmp_tmp = SubElement(father_xml, 'sbj')
xml_dict[node] = cmp_tmp
node.tag =1
# 主语 无ADVP子节点的...
if node.val == 'NP-FOC' or node.val == 'NP-FOC-12' or node.val == 'NP-FOC-15' or node.val == 'NP-FOC-2' or node.val == 'NP-FOC-3' or node.val == 'NP-FOC-4' or node.val == 'NP-FOC-5' or node.val == 'NP-FOC-9':
# 先把当前结点的子节点进行编码:
for index,no in enumerate(node.children):
# 如果没有子节点,则定义为sbj
if index == 0:
fined_node = self.find_father(node)
father_xml = xml_dict[fined_node]
cmp_tmp = SubElement(father_xml, 'sbj')
xml_dict[node]=cmp_tmp
node.tag = 1
# 无ADVP子节点:
elif no.val != 'ADVP':
fined_node = self.find_father(node)
father_xml = xml_dict[fined_node]
cmp_tmp = SubElement(father_xml, 'sbj')
xml_dict[node]=cmp_tmp
node.tag = 1
# 宾语
#左边有VV兄弟节点的LCP-LOC
if node.val =='LCP-LOC':
if len(node.father.children)>1:
for index,no in enumerate(node.father.children):
if no == node:
tmp_index = index
for i in node.father.children[:tmp_index]:
if i.val == 'VV':
fined_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cmp_tmp = SubElement(father_xml, 'obj')
xml_dict[node] = cmp_tmp
node.tag = 1
#左侧有VV、VC、VE的QP-EXT
if node.val =='QP-EXT':
if len(node.father.children)>1:
for index,no in enumerate(node.father.children):
if no == node:
tmp_index = index
if tmp_index - 1 >= 0:
left_value = node.father.children[tmp_index - 1].val
if left_value == 'VV' or left_value == 'VE' or left_value == 'VC':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
node.tag = 1
obj = SubElement(father_xml, 'obj')
xml_dict[node] = obj
else:
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
node.tag = 1
obj = SubElement(father_xml, 'cmp')
xml_dict[node] = obj
left_bro = node.father.children[tmp_index-1]
if node.val == 'VCD' and len(node.children) == 2: #联合
node.children[1].rel = 'uni'
# 根据PP确定补语和状语
if node.val == 'PP' and node.children[0].val.split('_')[0] != '-NONE-':
if len(node.father.children) > 1:
for index,no in enumerate(node.father.children):
if no == node:
tmp_index = index
# 右兄弟节点有VP的PP 不一定相邻
for bro_index in range(tmp_index,len(node.father.children)):
if node.father.children[bro_index].val == 'VP':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cmp_tmp = SubElement(father_xml, 'adv')
xml_dict[node] = cmp_tmp
node.tag = 1
# 状语 右兄弟节点有VP的PP-LOC、PP-LOC-2、PP-LOC-WH
if node.val == 'PP-LOC' or node.val == 'PP-LOC-2' or node.val == 'PP-LOC-WH' and node.children[0].val.split('_')[0] != '-NONE-':
if len(node.father.children) > 1:
for index,no in enumerate(node.father.children):
if no == node:
tmp_index = index
# 右兄弟节点有VP 不一定相邻
for i in node.father.children[tmp_index:]:
if i.val == 'VP':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cmp_tmp = SubElement(father_xml, 'adv')
xml_dict[node] = cmp_tmp
node.tag = 1
# 这些节点的子节点有ADVP是状语
if node.val == 'NP-FOC' or node.val == 'NP-FOC-12' or node.val == 'NP-FOC-15' or node.val == 'NP-FOC-2' or node.val == 'NP-FOC-3' or node.val == 'NP-FOC-4' or node.val == 'NP-FOC-5' or node.val == 'NP-FOC-9':
advp_tag = False
for index,no in enumerate(node.children):
if no.val == 'ADVP':
advp_tag = True
if advp_tag:
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cmp_tmp = SubElement(father_xml, 'adv')
xml_dict[node] = cmp_tmp
node.tag = 1
# 处理连动 VP节点的左侧和父亲节点都是VP
if node.val == 'VP' and node.father.val == 'VP':
if len(node.father.children) > 1:
for index,cur_node in enumerate(node.father.children):
if cur_node == node:
tmp_index = index
if tmp_index-1 >= 0 and node.father.children[tmp_index-1].val == 'VP':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cc = SubElement(father_xml, 'cc')
cc.set('fun','SER')
if node.val == 'PP' or node.val == 'PP-LOC':
if len(node.father.children) > 1:
for index,cur_node in enumerate(node.father.children):
if cur_node == node:
tma_index = index
if tma_index - 1 >= 0:
left_value = node.father.children[tma_index - 1].val.split('_')[0]
if left_value == 'VV':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
node.tag = 1
#修正 此处应该是补语cmp:左兄弟节点有VV的PP、PP-LOC
cmp = SubElement(father_xml, 'cmp')
xml_dict[node] = cmp
else:
if len(node.children) == 1 and node.children[0].children == None and node.children[0].val.strip().rsplit('_',1)[0]=='-NONE-':
if rules_dict[node.val] == 'sbj' or rules_dict[node.val] == 'obj':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
node.tag = 1
xml_tag = rules_dict[node.val]
xml_new_tag = SubElement(father_xml, xml_tag )
xml_dict[node] = xml_new_tag
else:
# 处理prd缺省
if node.val == 'QP-PRD' or node.val == 'NP-PRD' or node.val == 'PP-PRD':
if len(node.father.children) > 1:
for index,j in enumerate(node.father.children):
if j == node:
left_index = index-1
left_node = node.father.children[left_index]
if left_node.val != 'VC':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
prd_tmp = SubElement(father_xml, 'prd')
xml_new_tag = SubElement(prd_tmp, 'x')
if node.val == 'NP-OBJ':
if len(node.father.children) > 2:
for index,no in enumerate(node.father.children):
if no == node:
left_index = index-1
right_index=index+1
if left_index >= 0 and right_index <= len(node.father.children)-1:
left_node = node.father.children[left_index]
right_node = node.father.children[right_index]
if left_node.val == 'VV' and right_node.val == 'IP' and node.father.val == 'VP':
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
cc = SubElement(father_xml, 'cc')
cc.set('fun','PVT')
# 解决重复定语情况 之前无if判断
if (node.val == 'ADJP' and node.father.val == 'DNP') or rules_dict[node.val]=='app':
pass
elif len(node.children) == 1 and node.children[0].children!=None and len(node.children[0].children) == 1 and node.children[0].children[0].children == None and node.children[0].children[0].val.strip().rsplit('_',1)[0]=='-NONE-':
pass
elif node.val == 'IP-OBJ':
if len(node.father.children) > 1:
for index,no in enumerate(node.father.children):
if no == node:
left_index = index-1
if left_index >= 0:
left_node = node.father.children[left_index]
if left_node.val.split('_')[0] == 'PU':
pass
else:
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
xml_tag = rules_dict[node.val]
xml_new_tag = SubElement(father_xml, xml_tag )
xml_dict[node] = xml_new_tag
node.tag = 1
else:
finded_node = self.find_father(node)
father_xml = xml_dict[finded_node]
xml_tag = rules_dict[node.val]
xml_new_tag = SubElement(father_xml, xml_tag )
xml_dict[node] = xml_new_tag
node.tag = 1
xml_string = etree.tostring(root)
tree = minidom.parseString(xml_string)
return tree.toprettyxml()
def read_input(input_file):
with open(input_file,'r',encoding = 'utf-8') as rf:
content_list = rf.read().strip().split('\n\n')
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--i')
parser.add_argument('--inputfile')
parser.add_argument('--outputdir')
args = parser.parse_args()
rf = open(args.inputfile, "r", encoding='utf-8')
sent_list = rf.read().strip().split('\n\n')
for index, sent in enumerate(sent_list):
tree = N_Tree()
node_list = sent.split('\n')
tree.build(node_list)
tree.root = tree.node_dict[('TOP',1)]
node_res, sent_res = tree.preorder(tree.root)
index = str(index+1).zfill(5)
jbw_xml = tree.rule_conversion(node_res, sent_res, index)
save_file = str(args.outputdir) + index + '.xml'
with open(save_file, 'w', encoding='utf-8') as wf:
wf.write(jbw_xml)