-
Notifications
You must be signed in to change notification settings - Fork 0
/
ExtractEnFromPDF.py
63 lines (47 loc) · 1.9 KB
/
ExtractEnFromPDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pdfplumber
import nltk
import os
import re
from dotenv import load_dotenv
from nltk.tokenize import sent_tokenize
from langdetect import detect
# Load variables from the local `.env` file before any of them are read below.
load_dotenv('.env')
# Path of the PDF to process, taken from the INPUT_EN_FILE_PATH environment
# variable; a missing variable raises KeyError right here.
file_path = os.environ['INPUT_EN_FILE_PATH']
# Cheap pre-filter: is the sentence made mostly of alphabetic tokens?
def is_english(sentence):
    """Return True when more than half of the sentence's tokens are alphabetic.

    The sentence is split with ``nltk.word_tokenize`` and each token is
    tested with ``str.isalpha``. Note that ``isalpha`` accepts letters of
    any script, so this only filters out fragments dominated by digits and
    punctuation; it does not identify the language by itself.

    Args:
        sentence: The text fragment to inspect.

    Returns:
        True if strictly more than 50% of the tokens are alphabetic,
        False otherwise, including when the sentence yields no tokens.
    """
    words = nltk.word_tokenize(sentence)
    if not words:
        # Empty or whitespace-only input tokenizes to nothing; report it as
        # non-English instead of dividing by zero below.
        return False
    alphabetic_count = sum(1 for word in words if word.isalpha())
    return alphabetic_count / len(words) > 0.5
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        file_path: Path of the PDF, passed unchanged to ``pdfplumber.open``.

    Returns:
        The text of all pages that produced any text, in page order and
        separated by a newline. An empty string when no page has text.
    """
    with pdfplumber.open(file_path) as pdf:
        # A page without a text layer (e.g. a scanned image) may come back as
        # None or "" depending on the pdfplumber release; normalise both to
        # "" so the concatenation cannot fail with a TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next; pages without text are skipped.
    return "\n".join(page_text for page_text in page_texts if page_text)
def save_text_to_file(text, output_file_path):
    """Write ``text`` to ``output_file_path`` as UTF-8, replacing any existing file.

    Args:
        text: The string to store.
        output_file_path: Destination path; the file is created or truncated.
    """
    with open(output_file_path, mode="w", encoding="utf-8") as sink:
        sink.write(text)
def get_sentences_segments(extracted_text):
    """Split extracted text into sentences and keep only the English ones.

    Line breaks are flattened first ('\\n' becomes a space, '\\r' is dropped)
    and curly double quotes are replaced with straight ones. The text is then
    segmented with ``sent_tokenize``. A sentence is kept only if it passes
    the ``is_english`` pre-filter and langdetect classifies it as 'en'.

    Args:
        extracted_text: Raw text, e.g. as returned by extract_text_from_pdf.

    Returns:
        A list of the sentences detected as English, in document order.
    """
    # Local import keeps this fix self-contained; langdetect is already a
    # dependency of this file.
    from langdetect.lang_detect_exception import LangDetectException

    def _detected_as_english(sentence):
        # langdetect raises when it cannot extract any usable features from
        # the text; treat that as "not English" rather than letting a single
        # odd sentence abort the whole extraction.
        try:
            return detect(sentence) == 'en'
        except LangDetectException:
            return False

    # Remove line breaks
    flattened = extracted_text.replace('\n', ' ').replace('\r', '')
    # Normalise typographic double quotes to plain ASCII quotes
    flattened = re.sub('[“”]', '"', flattened)
    # Segmentation of sentences
    sentences = sent_tokenize(flattened)
    # is_english runs first so the more expensive detector is only invoked on
    # sentences that are mostly alphabetic.
    return [
        sentence
        for sentence in sentences
        if is_english(sentence) and _detected_as_english(sentence)
    ]
def save_segments_to_file(english_sentences):
    """Write the sentences to the output file and print a short summary.

    The destination path is read from the OUT_EN_FILE_PATH environment
    variable. Each sentence is written followed by an empty line, and the
    file is created or truncated.

    Args:
        english_sentences: Sequence of sentence strings to store.
    """
    # Destination of the output text file
    output_file_path = os.environ["OUT_EN_FILE_PATH"]
    with open(output_file_path, mode='w', encoding="utf-8") as sink:
        sink.writelines(sentence + "\n\n" for sentence in english_sentences)
    # Report where the result went and how many sentences were written
    print("Text extracted from PDF and saved to", output_file_path)
    print("Count Sentences", len(english_sentences))
# Script body: extract the raw text of the input PDF and write it, unfiltered,
# to the path named by the OUT_EN_FILE_PATH environment variable.
textFromPdf = extract_text_from_pdf(file_path)
save_text_to_file(textFromPdf,os.environ["OUT_EN_FILE_PATH"])
# Disabled sentence-level pipeline. NOTE(review): if re-enabled as written it
# would overwrite the raw text saved above, because save_segments_to_file also
# opens OUT_EN_FILE_PATH in write mode.
# segments = get_sentences_segments(textFromPdf)
# save_segments_to_file(segments)