-
Notifications
You must be signed in to change notification settings - Fork 0
/
NewExtracterCn.py
55 lines (40 loc) · 1.63 KB
/
NewExtracterCn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from pdfminer.high_level import extract_text
import os
from dotenv import load_dotenv
import jieba
# Initialize jieba once at import time (presumably this loads its dictionary
# eagerly so the first segmentation call is not delayed -- confirm in jieba docs).
jieba.initialize()
# Load settings from the '.env' file (resolved relative to the working directory)
# so the os.environ lookups in this script can find them.
load_dotenv('.env')
# Path of the input PDF. The variable is required: a missing
# INPUT_CN_FILE_PATH raises KeyError right here, at import time.
cn_file_path = os.environ['INPUT_CN_FILE_PATH']
# Function to check if a sentence contains Chinese characters
def is_chinese(sentence):
    """Return True when more than half of the tokens in *sentence* are Chinese.

    The sentence is segmented with ``jieba.lcut``; a token counts as Chinese
    when it contains at least one CJK Unified Ideograph (U+4E00..U+9FFF).
    The ratio is taken over all tokens jieba returns, so whitespace and
    punctuation tokens count against the sentence.

    Args:
        sentence: Text to classify.

    Returns:
        bool: True if the Chinese-token ratio is strictly greater than 0.5,
        False otherwise (including when segmentation yields no tokens).
    """
    words = jieba.lcut(sentence)
    if not words:
        # No tokens (e.g. an empty string): avoid dividing by zero below.
        return False
    # str.isalpha() is also true for Latin words, which made English text
    # look "Chinese"; test for actual CJK ideographs instead.
    chinese_words = [
        word for word in words
        if any('\u4e00' <= char <= '\u9fff' for char in word)
    ]
    return len(chinese_words) / len(words) > 0.5
def extract_text_from_pdf(file_path):
    """Return whatever pdfminer's ``extract_text`` produces for *file_path*."""
    return extract_text(file_path)
def save_text_to_file(text, output_file_path):
    """Write *text* to *output_file_path* as a single UTF-8 line.

    Every newline is turned into a space and every carriage return is
    dropped before writing. The target file is opened in "w" mode, so any
    existing content is replaced.
    """
    # One pass: '\n' -> ' ', '\r' -> removed.
    flattened = text.translate({ord('\n'): ' ', ord('\r'): None})
    with open(output_file_path, mode="w", encoding="utf-8") as out:
        out.write(flattened)
def get_sentences_segments(extracted_text):
    """Split *extracted_text* into segments wherever two newlines occur in a row.

    Returns the list given by ``str.split`` on that separator, so text
    without a blank line comes back as a one-element list.
    """
    return extracted_text.split('\n\n')
def save_segments_to_file(chinese_sentences):
    """Write each segment to the file named by ``OUT_CN_FILE_PATH``.

    Line breaks inside a segment are flattened (newline -> space, carriage
    return removed) and every segment is followed by a blank line. The
    file is written as UTF-8 and overwritten if it exists. A short summary
    (destination path and segment count) is printed afterwards.

    Raises:
        KeyError: if the ``OUT_CN_FILE_PATH`` environment variable is unset.
    """
    destination = os.environ["OUT_CN_FILE_PATH"]
    # '\n' -> ' ', '\r' -> removed, in a single pass per segment.
    linebreak_map = {ord('\n'): ' ', ord('\r'): None}
    with open(destination, mode='w', encoding='utf-8') as out:
        out.writelines(
            segment.translate(linebreak_map) + '\n\n'
            for segment in chinese_sentences
        )
    # Report where the output went and how many segments were written.
    print("Text extracted from PDF and saved to", destination)
    print("Count Sentences", len(chinese_sentences))
# Script driver: these statements run at import/execution time.
# 1. Pull the raw text out of the PDF named by INPUT_CN_FILE_PATH.
textFromPdf = extract_text_from_pdf(cn_file_path)
# 2. Break the text into segments at blank lines.
segments = get_sentences_segments(textFromPdf)
# 3. Write every segment to the output file. NOTE(review): is_chinese is not
#    applied anywhere in the visible code, so no language filtering happens here.
save_segments_to_file(segments)