-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Data Preprocessing and Anomaly Detection
- Loading branch information
Showing
3 changed files
with
380 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
source,title,price,location,odometer,image_count,anomaly_score,has_anomaly | ||
facebook,ram laramie x diesel,4999.0,"Houston, TX",100000.0,1,0.18596239000029452,0 | ||
facebook,chevrolet silverado crew cab rst pickup ft,52888.0,"Houston, TX",17000.0,1,0.10085177902743259,0 | ||
facebook,nissan versa sedan,7500.0,"Houston, TX",42000.0,1,0.16429336849520748,0 | ||
facebook,toyota runner trd premium sport utility,43602.0,"Houston, TX",19000.0,1,0.1245762809579582,0 | ||
facebook,ford f super duty crew cab platinum pickup ft,79992.0,"Houston, TX",25000.0,1,0.05489800948335205,0 | ||
facebook,ford mustang gt coupe,4999.0,"Spring, TX",9000.0,1,0.030995813296170405,1 | ||
facebook,kia k lxs sedan,20929.0,"Houston, TX",68000.0,1,0.15092247425696403,0 | ||
facebook,ford bronco black diamond sport utility,42995.0,"Houston, TX",13000.0,1,0.12393566265303096,0 | ||
facebook,ford expedition xlt sport utility,38995.0,"Houston, TX",65000.0,1,0.13088074579512832,0 | ||
craigslist,ford f king ranch x diesel nav roof new wheels tires,39995.0,WWW.GETADIESEL.COM,122000.0,24,0.14230162498520027,0 | ||
craigslist,ford f platinum x diesel nav sunroof toy tires,33885.0,WWW.GETADIESEL.COM,194000.0,24,0.13864289252206835,0 | ||
facebook,ford expedition max platinum sport utility,33739.0,"Houston, TX",100000.0,1,0.1446137154822576,0 | ||
craigslist,ram x cummins neckover flatbed new tires,38885.0,WWW.GETADIESEL.COM,156000.0,24,0.14113540010860937,0 | ||
facebook,ford explorer st sport utility,45788.0,"Houston, TX",21000.0,1,0.1218959689428955,0 | ||
craigslist,ram x cummins back cam b w hitch tx truck,32999.0,WWW.GETADIESEL.COM,209000.0,24,0.13630659675498263,0 | ||
facebook,lincoln aviator black label grand touring sport utility,85488.0,"Houston, TX",11000.0,1,0.042989215628509436,1 | ||
craigslist,bmw x,9500.0,Abilene,141000.0,10,0.06327754725917423,0 | ||
facebook,volkswagen atlas cross sport sel motion sport utility,35588.0,"Houston, TX",53000.0,1,0.13865588271484158,0 | ||
craigslist,ford lariat x v leather nav leveled new,27990.0,WWW.GETADIESEL.COM,109000.0,23,0.1191502343549033,0 | ||
facebook,ford f supercrew cab king ranch pickup ft,32998.0,"Houston, TX",90000.0,1,0.14349622056540895,0 | ||
craigslist,ford xl x diesel skirted cm flatbed new tires,49999.0,WWW.GETADIESEL.COM,71000.0,24,0.10649685605121312,0 | ||
facebook,ford maverick lariat pickup ft,36888.0,"Houston, TX",9000.0,1,0.12789629559974452,0 | ||
craigslist,ford super duty platinum crew cab,79900.0,Ford F-450 Super Duty,78000.0,24,-0.0178812849770843,1 | ||
facebook,toyota camry xse,2500.0,"Houston, TX",83000.0,1,0.1799660885783036,0 | ||
facebook,nissan sentra,8200.0,"Houston, TX",107000.0,1,0.18476581416772508,0 | ||
craigslist,peterbilt,42900.0,Peterbilt 579 w/sleeper,627000.0,22,-0.03071784696102875,1 | ||
facebook,ford f supercrew cab fx pickup ft,,"Houston, TX",118000.0,1,0.17950933276122344,0 | ||
craigslist,ram power wagon x hemi nav winch ram boxes new,35995.0,WWW.GETADIESEL.COM,107000.0,24,0.14183774540922023,0 | ||
facebook,dodge journey crossroad sport utility,1500.0,"Houston, TX",114000.0,1,0.17403495961802645,0 | ||
craigslist,ford f lariat x diesel leather nav roof new mt tires,28885.0,WWW.GETADIESEL.COM,202000.0,24,0.13989820895145788,0 | ||
craigslist,ram x cummins bfg tires priced trade value,24850.0,WWW.GETADIESEL.COM,196000.0,24,0.12870939627196742,0 | ||
facebook,buick enclave sport utility,3500.0,"Houston, TX",138000.0,1,0.1573383751826588,0 | ||
craigslist,ford king ranch x diesel nav roof gooseneck toyo ht tires,31800.0,WWW.GETADIESEL.COM,227000.0,24,0.1291563059292763,0 | ||
facebook,ford xlt pickup x,,"Houston, TX",113000.0,1,0.17926225200608426,0 | ||
craigslist,ford f supercrew platinum,40000.0,"Breckenridge, Texas",52000.0,14,-0.0027951370855203717,1 | ||
facebook,ram mega cab,2999.0,"Houston, TX",100000.0,1,0.18713626161795982,0 | ||
craigslist,hyundai n line night edition,1.0,Abilene,43000.0,8,0.03979466120983155,1 | ||
facebook,gmc sierra z x de enganche,4900.0,"Houston, TX",,1,0.1874700834395005,0 | ||
craigslist,ram laramie x cummins nav leveled new,29987.0,WWW.GETADIESEL.COM,200000.0,24,0.14122606551809191,0 | ||
facebook,ford f supercrew cab lariat pickup ft,13500.0,"Spring, TX",203000.0,1,0.030858986598606586,1 | ||
craigslist,chevy hd ltz x duramax skirted hydraulic hay bed,44995.0,WWW.GETADIESEL.COM,143000.0,24,0.13079924417412814,0 | ||
facebook,ram crew cab st pickup ft,4500.0,"Houston, TX",130000.0,1,0.17200986926549283,0 | ||
craigslist,gmc canyon elevated,45000.0,Abilene,37000.0,4,0.03465979717464063,1 | ||
facebook,ram,2999.0,"Houston, TX",100000.0,1,0.18713626161795982,0 | ||
craigslist,ford platinum x diesel lift new meyhems new,42995.0,WWW.GETADIESEL.COM,142000.0,24,0.13768723857478643,0 | ||
facebook,toyota corolla le plus sedan,9800.0,"Sugar Land, TX",75000.0,1,0.05611327436419844,0 | ||
craigslist,ford lariat x v auto nav black leather tx truck,27999.0,WWW.GETADIESEL.COM,109000.0,24,0.13904857016926075,0 | ||
facebook,toyota camry se sedan,2199.0,"Houston, TX",67000.0,1,0.1792899399858539,0 | ||
facebook,lincoln mkz sedan,6499.0,"Sugar Land, TX",127000.0,1,0.04385667950054006,1 | ||
craigslist,ford king ranch fx diesel nav bds leveling kit fox shocks,43850.0,WWW.GETADIESEL.COM,180000.0,24,0.12556633034364273,0 | ||
facebook,gmc sierra z slt x de enganche,7000.0,"Houston, TX",,1,0.1848175282650657,0 | ||
craigslist,ram x cummins back cam bfg tires gooseneck,48888.0,WWW.GETADIESEL.COM,131000.0,24,0.12066310860143203,0 | ||
facebook,gmc yukon,2999.0,"Houston, TX",100000.0,1,0.18713626161795982,0 | ||
craigslist,ram lonestar x cummins leveled new tires,49900.0,WWW.GETADIESEL.COM,126000.0,22,0.10459576629012846,0 | ||
facebook,chevrolet silverado z x de enganche,4400.0,"Houston, TX",,1,0.18714564823462826,0 | ||
craigslist,ram x hemi lift methods new,32990.0,WWW.GETADIESEL.COM,112000.0,18,0.09063296208902921,0 | ||
facebook,ford sport de enganche,5000.0,"Houston, TX",,1,0.18753136090512046,0 | ||
craigslist,ram laramie longhorn mega x cummins nav sunroof,29900.0,WWW.GETADIESEL.COM,228000.0,24,0.12913977538617455,0 | ||
facebook,chevrolet camaro rs,4000.0,"Pasadena, TX",130000.0,1,0.05475745759401013,0 | ||
craigslist,ram x cummins back cam nitto tires tx truck,28995.0,WWW.GETADIESEL.COM,215000.0,24,0.13808741550777864,0 | ||
facebook,ford lariat x de enganche,5500.0,"Houston, TX",,1,0.18761996593143898,0 | ||
craigslist,cadillac cts turbo automatic black leather well maintained,10900.0,WWW.GETADIESEL.COM,133000.0,24,0.10791024561852375,0 | ||
facebook,chevrolet silverado crew cab ls pickup ft,13000.0,"Cypress, TX",154000.0,1,0.02171750937448541,1 | ||
craigslist,mercedes benz suv,16900.0,Snyder,128000.0,12,0.026960617079861193,1 | ||
facebook,gmc sierra slt x de enganche,4400.0,"Houston, TX",,1,0.18714564823462826,0 | ||
craigslist,jeep patriot,7450.0,"Potosi, TX",104000.0,10,-0.011446317475731438,1 | ||
craigslist,wd laramie turbo ram,51000.0,Haskell,83000.0,7,0.04599964570521936,1 | ||
facebook,nissan altima,1900.0,"Houston, TX",36000.0,1,0.1707998571729935,0 | ||
craigslist,ford f fx crew cab great truck hail ding special,15995.0,Clyde,102000.0,24,0.02197260350359459,1 | ||
facebook,toyota corolla le special edition sedan,1599.0,"Houston, TX",108000.0,1,0.1817734713343247,0 | ||
craigslist,hyundai elantra,7000.0,ABILENE,151000.0,9,0.05029596196625413,0 | ||
facebook,dodge avenger sxt sedan,3100.0,"Houston, TX",121000.0,1,0.18009361046234534,0 | ||
craigslist,hyundai accent,7000.0,ABILENE,118000.0,7,0.05594094445557429,0 | ||
facebook,chevrolet silverado crew cab ltz pickup ft,2795.0,"Houston, TX",8000.0,1,0.14986769699766278,0 | ||
craigslist,chev impala,5000.0,ABILENE,251000.0,5,0.017526257955390256,1 | ||
facebook,lexus gs sedan,13500.0,"Houston, TX",105000.0,1,0.16806550282781013,0 | ||
craigslist,chevy,7500.0,ABILENE,122000.0,8,0.05477290460728734,0 | ||
facebook,toyota camry se nightshade edition sedan,1699.0,"Houston, TX",62000.0,1,0.1774492931727455,0 | ||
craigslist,toyota runner sr premium x v leather nav nitto,38985.0,WWW.GETADIESEL.COM,48000.0,23,0.10462856961402456,0 | ||
facebook,nissan altima sedan,8500.0,"Houston, TX",125000.0,1,0.1723584804683761,0 | ||
craigslist,wd laramie turbo ram,51000.0,Haskell,83000.0,7,0.04599964570521936,1 | ||
facebook,chevrolet silverado hd crew cab ltz x diesel sale trade,21500.0,"Houston, TX",210000.0,1,0.07556318897918501,0 | ||
craigslist,ram longhorn mega cab x cummins saddle leather nav,37995.0,WWW.GETADIESEL.COM,170000.0,24,0.1392140833500518,0 | ||
facebook,volkswagen jetta sedan,4200.0,"Spring, TX",160000.0,1,0.04167203165437211,1 | ||
craigslist,hyundai tuscon limited suv ready go,9895.0,Clyde,129000.0,20,0.04166519899940785,1 | ||
facebook,chevrolet suburban de enganche,4000.0,"Houston, TX",,1,0.18780700437853798,0 | ||
craigslist,ram x cummins auto cm skirted flatbed tx truck,23880.0,WWW.GETADIESEL.COM,198000.0,24,0.12496835375674564,0 | ||
facebook,chevrolet silverado lt de enganche,4500.0,"Houston, TX",,1,0.18714564823462826,0 | ||
craigslist,hyundai santa fe sport,9895.0,Clyde,125000.0,18,0.03327665339692032,1 | ||
facebook,chevrolet silverado z ltz pickup x,4495.0,"Katy, TX",112000.0,1,0.014676370351309698,1 | ||
craigslist,ram x cummins auto cloth new mt tires,25995.0,WWW.GETADIESEL.COM,146000.0,24,0.13080431946714566,0 | ||
facebook,audi q premium plus sport utility,2000.0,"Houston, TX",110000.0,1,0.18250311252758095,0 | ||
craigslist,ram slt x hemi leather b w hitch new,29900.0,WWW.GETADIESEL.COM,113000.0,24,0.14399505276303026,0 | ||
facebook,honda accord lx sedan,1700.0,"Houston, TX",30000.0,1,0.1641144807873059,0 | ||
craigslist,credit check toyota rav xle guaranteed approval,2500.0,www.DEPOTAUTOSALES.com,112000.0,9,0.002898395907380946,1 | ||
facebook,toyota camry,2999.0,"Houston, TX",70000.0,1,0.17909161271208118,0 | ||
craigslist,hyundai kona door hatchback,9895.0,Clyde,138000.0,21,0.03415303203722447,1 | ||
facebook,gmc yukon slt xl de enganche,5000.0,"Houston, TX",,1,0.18753136090512046,0 | ||
craigslist,ford lariat x diesel nav sunroof new tires,21800.0,WWW.GETADIESEL.COM,297000.0,24,0.09055779550252968,0 | ||
facebook,ram promaster,2999.0,"Houston, TX",100000.0,1,0.18713626161795982,0 | ||
craigslist,nissan rogue sv low miles,6500.0,Abilene,82000.0,10,0.06859122325442468,0 | ||
facebook,chevrolet silverado ls,4000.0,"Houston, TX",100000.0,1,0.18623941753769735,0 | ||
craigslist,ram power wagon x hemi nav winch ram boxes new,35995.0,WWW.GETADIESEL.COM,107000.0,24,0.14183774540922023,0 | ||
facebook,chevrolet silverado z ltz pickup x,,"Houston, TX",109000.0,1,0.18384758690876263,0 | ||
craigslist,nissan titan xd sl x v leather lift xds,29999.0,WWW.GETADIESEL.COM,90000.0,24,0.13336917160867512,0 | ||
facebook,toyota corolla se sedan,1699.0,"Houston, TX",45000.0,1,0.173492849128834,0 | ||
craigslist,gmc sierra hd sle x duramax x hostiles new,34990.0,WWW.GETADIESEL.COM,178000.0,24,0.13914141662536728,0 | ||
facebook,bmw series,2999.0,"Houston, TX",90000.0,1,0.1831797028453639,0 | ||
facebook,cadillac escalade,2000.0,"Houston, TX",59000.0,1,0.18063621572482608,0 | ||
facebook,ford ecoboost lifted de enganche,4900.0,"Houston, TX",,1,0.1874700834395005,0 | ||
facebook,honda civic sport sedan,1600.0,"Houston, TX",39000.0,1,0.16960915968965506,0 | ||
facebook,toyota corolla hatchback xse hatchback,2150.0,"Houston, TX",100000.0,1,0.18551098054038923,0 | ||
facebook,kia optima lx sedan,6800.0,"Humble, TX",91000.0,1,0.02860455322330069,1 | ||
facebook,toyota sienna se minivan,3000.0,"Houston, TX",120000.0,1,0.17907050597671365,0 | ||
facebook,toyota tacoma double cab trd sport pickup ft,3000.0,"Houston, TX",1000.0,1,0.14085733189109162,0 | ||
facebook,toyota tacoma double cab limited pickup ft,2999.0,"Houston, TX",90000.0,1,0.1831797028453639,0 | ||
facebook,honda civic dx sedan,5800.0,"Houston, TX",140000.0,1,0.164039503936043,0 | ||
facebook,toyota camry le sedan,6000.0,"Houston, TX",48000.0,1,0.1704424610556231,0 | ||
facebook,chrysler town country limited minivan,6950.0,"Houston, TX",140000.0,1,0.16174675471888023,0 | ||
facebook,ford mustang v coupe,10950.0,"Houston, TX",109000.0,1,0.17764476231710002,0 | ||
facebook,gmc sierra denali payment,5400.0,"Houston, TX",,1,0.18761996593143898,0 | ||
facebook,toyota corolla xse sedan,1699.0,"Houston, TX",41000.0,1,0.17447650468938009,0 | ||
facebook,nissan sentra sv,2999.0,"Houston, TX",180000.0,1,0.100369068821408,0 | ||
facebook,dodge charger,1200.0,"Houston, TX",82000.0,1,0.17329388620025316,0 | ||
facebook,chevrolet silverado crew cab lt pickup ft,22000.0,"Houston, TX",87000.0,1,0.15567646230885518,0 | ||
facebook,chevrolet silverado z,3000.0,"Houston, TX",,1,0.1876577958717826,0 | ||
facebook,toyota camry se sedan,1600.0,"Houston, TX",51000.0,1,0.1748493915305913,0 | ||
facebook,toyota camry se sedan,1699.0,"Houston, TX",27000.0,1,0.1630204107016024,0 | ||
facebook,honda civic,1600.0,"Houston, TX",38000.0,1,0.16960915968965506,0 | ||
facebook,chevrolet malibu lt sedan,1699.0,"Houston, TX",59000.0,1,0.17922929984416724,0 | ||
facebook,toyota corolla le eco premium sedan,9800.0,"Sugar Land, TX",74000.0,1,0.056526107666673986,0 | ||
facebook,toyota xle premium sport utility,2999.0,"Houston, TX",90000.0,1,0.1831797028453639,0 | ||
facebook,ford f supercrew cab platinum pickup ft,2995.0,"Katy, TX",8000.0,1,-0.006943396072467856,1 | ||
facebook,gmc canyon denali de enganche,4500.0,"Houston, TX",,1,0.18714564823462826,0 | ||
facebook,chrysler touring sedan,13800.0,"Garland, TX",70000.0,1,0.0180201601574127,1 | ||
facebook,toyota corolla le sedan,2000.0,"Arlington, TX",145000.0,1,0.05254372603546836,0 | ||
facebook,ford,3400.0,"Arlington, TX",79000.0,1,0.05778912416568971,0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
import pandas as pd | ||
from sklearn.impute import SimpleImputer | ||
from sklearn.preprocessing import StandardScaler, OneHotEncoder | ||
from sklearn.compose import ColumnTransformer | ||
from sklearn.pipeline import Pipeline # Importing Pipeline | ||
from sklearn.ensemble import IsolationForest | ||
import nltk | ||
from nltk.tokenize import word_tokenize | ||
from nltk.corpus import stopwords | ||
import re | ||
|
||
# Load the raw listings export.
data = pd.read_csv('data-1.csv')

# 1. Feature Selection and Preprocessing
# Work on a copy so the edits below never write back into `data`.
features = data[['source', 'title', 'price', 'location', 'odometer']].copy()

# New Feature: Count of images
# Every column whose name starts with 'images/' is treated as one image slot;
# the count is the number of non-null slots in the row.
image_columns = [col for col in data.columns if col.startswith('images/')]
data['image_count'] = data[image_columns].notna().sum(axis=1)
features['image_count'] = data['image_count']

# Handling missing values in text columns.
# Selecting columns with a list returns a new DataFrame, so calling
# fillna(..., inplace=True) on that selection never reaches `features`.
# Assign the filled columns back explicitly instead.
text_columns = ['source', 'title', 'location']
features[text_columns] = features[text_columns].fillna('')
|
||
# Convert price to a numerical value: strip '$' and ',' from string prices,
# then coerce anything that still is not a number to NaN.
# The pattern is a raw string so that '\$' reaches the regex engine as written
# instead of being an invalid Python string escape.
features['price'] = features['price'].replace(r'[\$,]', '', regex=True)
features['price'] = pd.to_numeric(features['price'], errors='coerce')
|
||
# Convert odometer to numerical values | ||
def convert_odometer(odometer_str):
    """Parse an odometer cell into an integer reading.

    Args:
        odometer_str: The raw cell value. May be missing (None/NaN), already
            numeric, or text containing a number such as '100k', '12,345'
            or '1.5K'.

    Returns:
        The reading as an int, or None when the value is missing or contains
        no number. A 'k' suffix directly after the number (optionally
        separated by whitespace) multiplies it by 1000.
    """
    if pd.isna(odometer_str):
        return None
    # Cells that were already parsed as numbers need no text handling.
    if isinstance(odometer_str, (int, float)):
        return int(odometer_str)

    # First number in the text, allowing thousands separators and a decimal
    # part. The 'k' must end a word so that e.g. 'km' is not read as a
    # thousands suffix.
    match = re.search(
        r'(\d[\d,]*(?:\.\d+)?)\s*(k\b)?',
        str(odometer_str),
        flags=re.IGNORECASE,
    )
    if match is None:
        return None

    number = float(match.group(1).replace(',', ''))
    if match.group(2):
        number *= 1000  # Convert 'k' to thousands
    return int(round(number))
|
||
# Replace each raw odometer entry with the value produced by convert_odometer.
features['odometer'] = features['odometer'].apply(convert_odometer)

# Text preprocessing for 'title'
# Fetch the NLTK resources used below, then build the English stop-word set.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
stop_words = {*stopwords.words('english')}
|
||
def clean_text(text):
    """Normalise a title into space-separated lowercase alphabetic tokens.

    The text is lowercased, runs of digits are removed, the remainder is
    tokenised with ``word_tokenize``, and tokens that are not purely
    alphabetic or that appear in ``stop_words`` are dropped.

    Args:
        text: The raw title. Non-string values (for example NaN from a
            missing cell) are treated as an empty title.

    Returns:
        The remaining tokens joined by single spaces; '' when nothing remains.
    """
    # A missing title arrives as a float NaN, which has no .lower().
    if not isinstance(text, str):
        return ''
    without_digits = re.sub(r'\d+', '', text.lower())  # Remove numbers
    tokens = word_tokenize(without_digits)
    return ' '.join(
        token for token in tokens
        if token.isalpha() and token not in stop_words
    )
|
||
# Run every title through clean_text.
features['title'] = features['title'].map(clean_text)

# Preprocessing Pipeline
# Only the columns named here are handed to the transformers below;
# 'title' is in neither list.
cat_features = ['source', 'location']
num_features = ['price', 'odometer', 'image_count']

# Numeric columns: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Numeric columns go through num_pipeline; categorical columns are one-hot encoded.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', OneHotEncoder(), cat_features),
])
|
||
# 2. Unsupervised Anomaly Detection
# Transform the selected features, then fit an Isolation Forest on the result.
features_preprocessed = preprocessor.fit_transform(features)
model = IsolationForest(n_estimators=100, contamination='auto', random_state=42)
model.fit(features_preprocessed)

# 3. Score the listings and save results
scores = model.decision_function(features_preprocessed)

# Listings whose score is 0.05 or less are flagged (potential scam).
is_flagged = scores <= 0.05

# Cleaned features plus the score and the 0/1 flag, kept readable:
# no feature scaling or one-hot encoding is applied to this frame.
model_data = features.assign(
    anomaly_score=scores,
    has_anomaly=is_flagged.astype(int),
)
model_data.to_csv('model_data_with_anomalies.csv', index=False)

# Report how many listings were flagged.
num_anomalous_listings = is_flagged.sum()
print(f"Number of listings with anomaly score of 0.05 or less: {num_anomalous_listings}")

# Pair each listing's '_id' with its score and save them to a CSV file.
scores_df = (
    data['_id']
    .rename('listing_link')
    .to_frame()
    .assign(anomaly_score=scores)
)
scores_df.to_csv('scores_and_links.csv', index=False)
Oops, something went wrong.
09b1264
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Successfully deployed to the following URLs:
seniordesign – ./
smare.vercel.app
seniordesign-git-main-lryanle.vercel.app
seniordesign-lryanle.vercel.app
smare.lryanle.com