-
Notifications
You must be signed in to change notification settings - Fork 0
/
Version2.py
164 lines (140 loc) · 9.25 KB
/
Version2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import os
import pdfplumber
import csv
import difflib
def convert_bytes_to_human_readable(size_in_bytes):
    """Format a byte count as a string with two decimals and a binary unit.

    The value is divided by 1024 until it drops below 1024 or the largest
    supported unit (TB) is reached.

    Args:
        size_in_bytes: Size in bytes (int or float).

    Returns:
        A string such as ``"1.50 KB"``. Values of 1024 TB or more are
        reported in TB (e.g. ``"1024.00 TB"``).
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB')
    size = size_in_bytes
    # Only scale while a larger unit is still available; scaling past the
    # last unit would divide once too often and under-report by 1024x.
    for unit in units[:-1]:
        if size < 1024.0:
            return f"{size:.2f} {unit}"
        size /= 1024.0
    return f"{size:.2f} {units[-1]}"
def _comparison_row(source_filename, target_filename, source_size, target_size,
                    source_page_count, target_page_count, identical, comments):
    """Build one report row keyed by the report's column headers."""
    return {
        'Source_FileName': source_filename,
        # The misspelled header is kept on purpose: it is also the column
        # name read from the input list, so changing it would break both.
        'Traget file name': target_filename,
        'Source_FileSize': source_size,
        'Target_FileSize': target_size,
        'Source_PageCount': source_page_count,
        'Target_PageCount': target_page_count,
        'Identical': identical,
        'Comments': comments,
    }


def _append_log(log_file, lines):
    """Append already newline-terminated *lines* to the UTF-8 log file."""
    with open(log_file, 'a', encoding='utf-8') as log:
        log.writelines(lines)


def compare_pdfs(source_folder, target_folder, csv_file, log_file,
                 file_list="file_list.csv"):
    """Compare pairs of PDF files and write a CSV report plus a text log.

    Each row of *file_list* names a source and a target file (without the
    ``.pdf`` extension). For every pair the function checks, in order:
    both files exist, the file sizes are equal, the page counts are equal,
    and the extracted text is equal. Checking stops at the first failed
    step and the reason is recorded in the report's ``Comments`` column.

    Args:
        source_folder: Directory containing the source PDFs.
        target_folder: Directory containing the target PDFs.
        csv_file: Path of the CSV report to (over)write.
        log_file: Path of the text log; entries are appended.
        file_list: Path of the input CSV with the columns
            ``Source_FileName`` and ``Traget file name``. Defaults to
            ``"file_list.csv"`` in the current working directory.

    Raises:
        OSError: If the report or the input list cannot be opened.
    """
    fieldnames = ['Source_FileName', 'Traget file name', 'Source_FileSize', 'Target_FileSize',
                  'Source_PageCount', 'Target_PageCount', 'Identical', 'Comments']

    # UTF-8 keeps the report consistent with the log file and avoids
    # encode errors for file names outside the locale's code page.
    with open(csv_file, 'w', newline='', encoding='utf-8') as csv_out:
        csv_writer = csv.DictWriter(csv_out, fieldnames=fieldnames)
        csv_writer.writeheader()

        # newline='' lets the csv module do its own newline handling.
        with open(file_list, 'r', newline='') as list_in:
            for row in csv.DictReader(list_in):
                # DictReader fills missing trailing fields with None, so
                # fall back to "" to keep the path building below safe.
                source_filename = row.get("Source_FileName") or ""
                target_filename = row.get("Traget file name") or ""
                source_path = os.path.join(source_folder, source_filename + ".pdf")
                target_path = os.path.join(target_folder, target_filename + ".pdf")
                print(f"Processing files: {source_path}, {target_path}")

                if not (os.path.isfile(source_path) and os.path.isfile(target_path)):
                    print(f"Files {source_filename} or {target_filename} do not exist.")
                    csv_writer.writerow(_comparison_row(
                        source_filename, target_filename, '-', '-', '-', '-',
                        'Non Identical', 'Files do not exist'))
                    continue

                source_size = os.path.getsize(source_path)
                target_size = os.path.getsize(target_path)
                source_size_text = convert_bytes_to_human_readable(source_size)
                target_size_text = convert_bytes_to_human_readable(target_size)

                if source_size != target_size:
                    _append_log(log_file, [
                        f"Processing files: {source_path}, {target_path}\n",
                        "Source and target file sizes do not match.\n",
                        "Target file does not match with the source file.\n\n",
                    ])
                    print(f"Processing files: {source_path}, {target_path}")
                    print("Source and target file sizes do not match.")
                    print("Target file does not match with the source file.")
                    csv_writer.writerow(_comparison_row(
                        source_filename, target_filename,
                        source_size_text, target_size_text, '-', '-',
                        'Non Identical', 'File size does not match'))
                    continue

                source_page_count = get_page_count(source_path)
                target_page_count = get_page_count(target_path)

                if source_page_count != target_page_count:
                    _append_log(log_file, [
                        f"Processing files: {source_path}, {target_path}\n",
                        f"Source and target file sizes match ({source_size_text}).\n",
                        "Source and target page counts do not match.\n",
                        "Target file does not match with the source file.\n\n",
                    ])
                    print(f"Processing files: {source_path}, {target_path}")
                    print(f"Source and target file sizes match ({source_size_text}).")
                    print("Source and target page counts do not match.")
                    print("Target file does not match with the source file.")
                    csv_writer.writerow(_comparison_row(
                        source_filename, target_filename,
                        source_size_text, target_size_text,
                        source_page_count, target_page_count,
                        'Non Identical', 'Page count does not match'))
                    continue

                source_content = extract_content_from_pdf(source_path)
                target_content = extract_content_from_pdf(target_path)
                contents_match = source_content == target_content

                log_lines = [
                    f"Processing files: {source_path}, {target_path}\n",
                    f"Source and target file sizes match ({source_size_text}).\n",
                    f"Source and target page counts match ({source_page_count} pages).\n",
                ]
                if contents_match:
                    log_lines.append(
                        f"Files {source_filename} and {target_filename} are identical.\n")
                    identical = 'Identical'
                    comments = 'File compared successfully'
                else:
                    # The similarity ratio and the diff are costly on large
                    # texts, so they are only computed when actually reported.
                    match_percentage = calculate_content_match_percentage(
                        source_content, target_content)
                    differences = get_content_differences(source_content, target_content)
                    log_lines.append(
                        f"Files {source_filename} and {target_filename} are different.\n")
                    log_lines.append("Changes:\n")
                    # The diff text has no trailing newline; add one so the
                    # percentage line does not run into the last diff line.
                    log_lines.append(differences + "\n")
                    log_lines.append(f"Content match percentage: {match_percentage:.2f}%\n")
                    identical = 'Non Identical'
                    comments = 'Content does not match'
                log_lines.append("\n")
                _append_log(log_file, log_lines)

                print(f"Files {source_filename} and {target_filename} processed.")
                print(f"Source and target file sizes match ({source_size_text}).")
                print(f"Source and target page counts match ({source_page_count} pages).")
                if contents_match:
                    print(f"Files {source_filename} and {target_filename} are identical.")
                else:
                    print(f"Files {source_filename} and {target_filename} are different.")
                    print(f"Content match percentage: {match_percentage:.2f}%")

                csv_writer.writerow(_comparison_row(
                    source_filename, target_filename,
                    source_size_text, target_size_text,
                    source_page_count, target_page_count,
                    identical, comments))
def get_page_count(pdf_path):
    """Return the number of pages in the PDF located at *pdf_path*."""
    with pdfplumber.open(pdf_path) as document:
        return len(document.pages)
def extract_content_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path* as one string.

    Pages are joined with a newline so the last line of one page and the
    first line of the next stay separate lines; without a separator they
    fuse into one, and text shifted across a page break compares as equal.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts. Pages that yield no text contribute an
        empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # NOTE(review): some pdfplumber versions appear to return None
        # instead of "" for a page with no text, hence the `or ""` guard.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # You can include other content extraction logic here (e.g., images, values)
    return "\n".join(page_texts)
def get_content_differences(source_content, target_content):
    """Return a line-by-line ``difflib.Differ`` delta of the two texts.

    Both inputs are split into lines, compared, and the resulting delta
    lines are joined with newlines into a single string.
    """
    source_lines = source_content.splitlines()
    target_lines = target_content.splitlines()
    delta = difflib.Differ().compare(source_lines, target_lines)
    return '\n'.join(delta)
def calculate_content_match_percentage(source_content, target_content):
    """Return the similarity of the two texts as a percentage (0-100).

    The value is ``difflib.SequenceMatcher``'s ratio scaled by 100.
    """
    matcher = difflib.SequenceMatcher(None, source_content, target_content)
    return matcher.ratio() * 100
if __name__ == "__main__":
    # Input folders and output files used when run as a script.
    source_folder = "source/"
    target_folder = "target/"
    csv_file = "comparison_results.csv"
    log_file = "comparison_log.txt"

    # Truncate the log and the report so each run starts from empty files
    # (the log is otherwise only ever appended to).
    with open(log_file, 'w', encoding='utf-8'):
        pass
    with open(csv_file, 'w', newline='', encoding='utf-8'):
        pass

    compare_pdfs(source_folder, target_folder, csv_file, log_file)