-
Notifications
You must be signed in to change notification settings - Fork 1
/
convert_dataset.py
119 lines (98 loc) · 4.02 KB
/
convert_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import os
import csv
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Root directory to scan; its immediate subdirectories are searched for .txt files
main_folder = 'main_folder'
# Rows of [file name, BIRADS score], appended to as files with a score are found
data = []
# Function to normalize BIRADS score
def normalize_birads_score(score):
    """Return the first digit character found in *score*.

    Args:
        score: Text to search (callers in this file pass a single line).

    Returns:
        The first digit as a one-character string, or ``None`` when the
        text contains no digit at all.
    """
    first_digit = re.search(r'(\d)', score)
    if first_digit is None:
        return None
    return first_digit.group(1)
# Function to extract BIRADS score from text
def extract_birads_score(text):
    """Find the BIRADS score in a block of text.

    The text is split on newlines and scanned top to bottom. Only lines that
    contain ``BIRADS`` or ``SONUÇ`` (case-sensitive) are considered; the first
    such line that yields a digit from ``normalize_birads_score`` wins.

    Args:
        text: Full text to scan.

    Returns:
        The score as a one-character string, or ``None`` when no marker line
        contains a digit.
    """
    markers = ('BIRADS', 'SONUÇ')
    for candidate in text.split('\n'):
        if not any(marker in candidate for marker in markers):
            continue
        digit = normalize_birads_score(candidate)
        if digit:
            return digit
    return None
# Function to remove the BIRADS score section from text
def remove_birads_score(text):
    """Drop every line that mentions ``BIRADS`` or ``SONUÇ`` from *text*.

    Matching is case-sensitive and works on newline-separated lines; all
    other lines are kept in their original order.

    Args:
        text: Full text to filter.

    Returns:
        The remaining lines joined back together with ``\\n``.
    """
    kept_lines = [
        line
        for line in text.split('\n')
        if 'BIRADS' not in line and 'SONUÇ' not in line
    ]
    return '\n'.join(kept_lines)
# Traverse every immediate subfolder of main_folder in a single pass: extract
# each report's BIRADS score, strip the score lines from the file on disk, and
# record the cleaned text together with the row it belongs to.
#
# Entries are sorted so the row order does not depend on the order in which
# the operating system lists directory entries; the seeded shuffle applied to
# database.csv later in the script is only reproducible if its input order is.
subfolders = sorted(f.path for f in os.scandir(main_folder) if f.is_dir())

# Rows for database.csv: [file name, BIRADS score, cleaned content].
csv_data = []

for subfolder in subfolders:
    for root, dirs, files in os.walk(subfolder):
        # Sorting in place makes os.walk descend into subdirectories in a
        # stable order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.txt'):
                continue
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            birads_score = extract_birads_score(content)
            cleaned_content = remove_birads_score(content)
            if birads_score:
                data.append([file, birads_score])
                # Attach the content to this exact file's row. Looking the
                # content up by bare file name afterwards would give every
                # same-named file in different subfolders the same text.
                csv_data.append([file, birads_score, cleaned_content])

            # Write the cleaned content back to the file
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(cleaned_content)

# Define the path to save the CSV file
csv_file_path = 'output2.csv'
# Save the file name / score pairs to a CSV file
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['File Name', 'BIRADS Score'])
    writer.writerows(data)
print(f"CSV file has been created successfully at {csv_file_path}")
print("BIRADS score sections have been removed from all text files.")

# Same columns as output2.csv plus the cleaned report text
headers = ['File Name', 'BIRADS Score', 'Content']

# Write the rows, now including their content, to a new CSV file
new_csv_file_path = 'database.csv'
with open(new_csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(headers)
    writer.writerows(csv_data)
print(f"CSV file has been updated successfully at {new_csv_file_path}")
# Read the combined CSV back in as a DataFrame
df = pd.read_csv(new_csv_file_path)
# Reorder all rows at random (fixed seed) and renumber the index from zero
shuffled_df = df.sample(frac=1, random_state=42)
shuffled_df = shuffled_df.reset_index(drop=True)
# Persist the shuffled rows without the index column
shuffled_df.to_csv('main_dataframe.csv', index=False)
print("DataFrame has been shuffled and saved as main_dataframe.csv")