dataloader.py
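"""
Build an evaluation dataset from the IR-Plag source code plagiarism dataset
(https://github.com/oscarkarnalim/sourcecodeplagiarismdataset). Each case folder
contains an original Java file plus non-plagiarized and plagiarized variants
(grouped by plagiarism level, e.g. L1, L2, ...); every variant is paired with
the original file and labelled with its level and a plagiarized flag.
"""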
# imports
import os
import re
import pandas as pd


def read_file_content(file_path: str) -> str:
"""
Read and return the content of a file, stripping any leading/trailing whitespace.
:param file_path: Path to the file to be read
:return: Content of the file as a string
"""
with open(file_path) as file:
return file.read().strip()


def get_case_number(case_name: str) -> int:
"""
Extract the case number from a case name, removing any leading zeros.
:param case_name: Name of the case (e.g., "case-01")
:return: Case number as an integer
"""
    return int(re.search(r"\d+", case_name).group())


def process_non_plagiarized_files(
case_path: str, original_content: str, case_number: int
) -> list:
"""
Process non-plagiarized files for a given case.
:param case_path: Path to the case folder
:param original_content: Content of the original file
:param case_number: Number of the current case
:return: List of dictionaries containing information about non-plagiarized files
"""
non_plagiarized_path = os.path.join(case_path, "non-plagiarized")
rows = []
# Get folders, excluding hidden folders and those containing "01", "02", or "03"
folders = [
f
for f in os.listdir(non_plagiarized_path)
if not f.startswith(".") and not any(x in f for x in ["01", "02", "03"])
]
for folder in folders:
folder_path = os.path.join(non_plagiarized_path, folder)
java_files = [f for f in os.listdir(folder_path) if f.endswith(".java")]
for file in java_files:
file_content = read_file_content(os.path.join(folder_path, file))
rows.append(
{
"L": 0, # Level 0 for non-plagiarized files
"case": case_number,
"sample_1": original_content,
"sample_2": file_content,
"plagiarized": False,
"reason": None,
}
)
return rows


def process_plagiarized_files(
case_path: str, original_content: str, case_number: int
) -> list:
"""
Process plagiarized files for a given case.
:param case_path: Path to the case folder
:param original_content: Content of the original file
:param case_number: Number of the current case
:return: List of dictionaries containing information about plagiarized files
"""
plagiarized_path = os.path.join(case_path, "plagiarized")
rows = []
# Get plagiarized folders, excluding hidden folders
plagiarized_folders = [
f for f in os.listdir(plagiarized_path) if not f.startswith(".")
]
for p_folder in plagiarized_folders:
p_folder_path = os.path.join(plagiarized_path, p_folder)
        # Get level folders, excluding hidden folders (files under L1/01 are skipped below)
level_folders = [f for f in os.listdir(p_folder_path) if not f.startswith(".")]
for level_folder in level_folders:
level_folder_path = os.path.join(p_folder_path, level_folder)
java_files = [
f for f in os.listdir(level_folder_path) if f.endswith(".java")
]
            for file in java_files:
                # Skip files under L1/01, which are excluded from the dataset
                if p_folder == "L1" and level_folder == "01":
                    continue
file_content = read_file_content(os.path.join(level_folder_path, file))
rows.append(
{
"L": int(p_folder.replace("L", "")), # Extract level number
"case": case_number,
"sample_1": original_content,
"sample_2": file_content,
"plagiarized": True,
"reason": None,
}
)
return rows


def build_eval_dataset(data_path: str) -> pd.DataFrame:
"""
Build the evaluation dataset by processing all cases in the given data path.
:param data_path: Path to the directory containing all case folders
:return: Pandas DataFrame containing the evaluation dataset
"""
rows = []
# Get all case folders
case_folders = [folder for folder in os.listdir(data_path) if "case-" in folder]
for case in case_folders:
case_path = os.path.join(data_path, case)
case_number = get_case_number(case)
# Find and read the original file
original_file = [
f
for f in os.listdir(os.path.join(case_path, "original"))
if f.endswith(".java")
][0]
original_content = read_file_content(
os.path.join(case_path, "original", original_file)
)
# Process non-plagiarized and plagiarized files
rows.extend(
process_non_plagiarized_files(case_path, original_content, case_number)
)
rows.extend(process_plagiarized_files(case_path, original_content, case_number))
return pd.DataFrame(rows)


def check_if_data_folder_exits(data_folder: str) -> None:
    """
    Check that the data folder exists, printing download instructions if it does not.
    :param data_folder: Path to the folder expected to contain the IR-Plag dataset
    :return: None
    """
    if not os.path.exists(data_folder):
print(f"The folder '{data_folder}' does not exist in the repository.")
print(
"Please download it from: https://github.com/oscarkarnalim/sourcecodeplagiarismdataset/blob/master/IR-Plag-Dataset.zip"
)
print(
"After downloading, extract the contents to the 'data' folder in your repository."
)
        raise FileNotFoundError(f"The folder '{data_folder}' does not exist.")
else:
print(
f"The folder '{data_folder}' exists. You can proceed with loading the data."
)
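

# A minimal usage sketch, assuming the IR-Plag-Dataset archive has been extracted
# into a local "data" folder (as described in check_if_data_folder_exits above);
# the folder name is an assumption and may need adjusting to your local layout.
if __name__ == "__main__":
    data_folder = "data"  # hypothetical path to the extracted dataset
    check_if_data_folder_exits(data_folder)
    eval_df = build_eval_dataset(data_folder)
    print(eval_df.head())
    print(f"Loaded {len(eval_df)} sample pairs from {eval_df['case'].nunique()} cases.")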