# -*- coding: utf-8 -*-
"""bconn_ext.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1B88-4BPQ32fj34ZZyg1l7u6wlYOSf9SX
"""
# Install the CRF dependency first (in a Colab/Jupyter cell: !pip install sklearn-crfsuite)
import xml.etree.ElementTree as ET

def parse_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    sentences = []
    for doc in root.findall('.//document'):
        for sent in doc.findall('.//sentence'):
            sent_text = sent.get('text')
            entities = {entity.get('id'): entity.get('text') for entity in sent.findall('.//entity')}
            # Tokenize the sentence text; whitespace splitting is a simplistic tokenizer
            tokens = sent_text.split()
            # Build a list of (word, label) tuples
            labeled_tokens = []
            for token in tokens:
                label = 'O'  # Default label for non-entity tokens
                for entity_id, entity_text in entities.items():
                    # Substring match only catches entities contained in a single
                    # token; multi-word entity spans are missed by this heuristic
                    if entity_text in token:
                        label = 'B-Individual_protein'  # Replace with actual entity type
                        break
                labeled_tokens.append((token, label))
            sentences.append(labeled_tokens)
    return sentences
file_path = '/content/WhiteTextNegFixFull.xml' # Replace with your XML file path
sentences = parse_xml(file_path)
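# Quick sanity check (an added sketch, not in the original notebook): inspect
# the first parsed sentence to confirm tokens and labels line up as expected.
if sentences:
    print(f"Parsed {len(sentences)} sentences; first sentence sample:")
    print(sentences[0][:10])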
import sklearn_crfsuite
from sklearn_crfsuite import metrics
def word2features(sent, i):
    word = sent[i][0]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True
    return features
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]
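# Illustrative check (added here, not part of the original script): print the
# feature dict for the first token of the first sentence to see exactly what
# the CRF will consume.
if sentences:
    print(word2features(sentences[0], 0))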
# Split data into training and test sets (80/20 split; customize as needed)
train_sents = sentences[:int(len(sentences) * 0.8)]
test_sents = sentences[int(len(sentences) * 0.8):]
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]
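# Added sanity check (not in the original): inspect the training label
# distribution, since a heavy 'O' majority is typical for this kind of
# tagging data and affects how the scores below should be read.
from collections import Counter
print(Counter(label for sent in y_train for label in sent))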
# Train the CRF model
crf = sklearn_crfsuite.CRF(
algorithm='lbfgs',
c1=0.1,
c2=0.1,
max_iterations=100,
all_possible_transitions=True
)
# Note: with scikit-learn >= 0.24, sklearn-crfsuite 0.3.6 is known to raise a
# spurious AttributeError at the end of fit even though training succeeds;
# the commonly used workaround is to swallow that exception.
try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass
# Make predictions
y_pred = crf.predict(X_test)
# Evaluate the model
print("Token-level accuracy:", metrics.flat_accuracy_score(y_test, y_pred))
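# Per-label breakdown (an added sketch): token accuracy is dominated by the
# majority 'O' tag, so a report restricted to the entity labels is more telling.
labels = [l for l in crf.classes_ if l != 'O']
print(metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))

# Apply the trained model to the unseen evaluation set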
new_file_path = '/content/WhiteTextUnseenEval.xml'
new_sentences = parse_xml(new_file_path)
X_new = [sent2features(s) for s in new_sentences]
new_predictions = crf.predict(X_new)
# Iterate over all sentences and their predictions
for sentence, predicted_labels in zip(new_sentences, new_predictions):
    for word, prediction in zip(sentence, predicted_labels):
        print(f"{word[0]}: {prediction}")
    print("\n--- End of Sentence ---\n")
# Also persist the predictions to a text file in the same token-per-line format
with open("predicted_results.txt", "w") as file:
    for sentence, predicted_labels in zip(new_sentences, new_predictions):
        for word, prediction in zip(sentence, predicted_labels):
            file.write(f"{word[0]}: {prediction}\n")
        file.write("\n--- End of Sentence ---\n\n")
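# Optional persistence sketch (added; assumes joblib is available, which it is
# wherever scikit-learn is installed): save the trained CRF so the unseen-set
# evaluation can be rerun without retraining.
import joblib
joblib.dump(crf, "bconn_crf.joblib")
# crf = joblib.load("bconn_crf.joblib")  # reload in a later session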