-
Notifications
You must be signed in to change notification settings - Fork 0
/
monthlyprediction.py
151 lines (124 loc) · 5.09 KB
/
monthlyprediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from statsmodels.tsa.arima.model import ARIMA
from pmdarima import auto_arima
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import glob
import os
# Create a directory for the plots if it doesn't exist
if not os.path.exists('plots'):
os.makedirs('plots')
# Get all CSV files
all_files = glob.glob("comptage_velo_*.csv")
# Read and concatenate all CSV files
df_list = []
for filename in all_files:
df = pd.read_csv(filename)
year = filename.split('_')[-1].split('.')[0] # Extract year from filename
df['year'] = year
df_list.append(df)
df = pd.concat(df_list, ignore_index=True)
# Define a function to clean up the time format
def fix_time_format(time_str):
# If the time is already in HH:MM:SS format, return as is
if len(time_str.split(':')) == 3:
return time_str
# If the time is in HH:MM format, append ':00' to make it HH:MM:SS
elif len(time_str.split(':')) == 2:
return time_str + ':00'
return time_str
# Clean up the time format
df['datetime_str'] = df['date'] + ' ' + df['heure']
df['datetime_str'] = df['datetime_str'].apply(fix_time_format)
# Convert date and time to datetime
df['datetime'] = pd.to_datetime(df['datetime_str'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
# Check if any datetime conversion failed and inspect those rows
invalid_dates = df[df['datetime'].isna()]
print(f"Invalid datetime rows:\n{invalid_dates[['datetime_str']]}")
# Aggregate data to monthly level
monthly_data = df.groupby(pd.Grouper(key='datetime', freq='M'))['nb_passages'].sum().reset_index()
# Split data into training (2019-2023) and testing (2024)
train_data = monthly_data[monthly_data['datetime'].dt.year < 2024]
test_data = monthly_data[monthly_data['datetime'].dt.year == 2024]
print(train_data.head())
print(test_data.head())
def millions_formatter(x, pos):
return f'{x/1e6:.1f}M'
# Plot the time series
plt.figure(figsize=(12,6))
plt.plot(train_data['datetime'], train_data['nb_passages'], label='Training Data')
plt.plot(test_data['datetime'], test_data['nb_passages'], label='Test Data', color='#2ca02c')
plt.title('Monthly Bike Trips')
plt.xlabel('Date')
plt.ylabel('Number of Trips')
plt.legend()
plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(millions_formatter))
plt.savefig('plots/monthly_bike_trips.png')
plt.close()
# Prepare the training data
y = train_data['nb_passages']
y.index = train_data['datetime']
# Use auto_arima to find the best parameters
model = auto_arima(y, start_p=1, start_q=1, max_p=3, max_q=3, m=12,
start_P=0, seasonal=True, d=1, D=1, trace=True,
error_action='ignore', suppress_warnings=True, stepwise=True)
# Fit the model
model.fit(y)
print(model.summary())
# Generate predictions for 2024
n_periods = len(test_data)
fc, confint = model.predict(n_periods=n_periods, return_conf_int=True)
# Create a dataframe with the forecasts
fc_series = pd.Series(fc, index=test_data['datetime'])
# Plot the results
plt.figure(figsize=(12,6))
plt.plot(train_data['datetime'], train_data['nb_passages'], label='Training Data', color='#1f77b4')
plt.plot(test_data['datetime'], test_data['nb_passages'], label='Actual 2024 Data', color='#2ca02c')
plt.plot(fc_series.index, fc_series, color='#d62728', label='Forecast')
plt.fill_between(fc_series.index,
confint[:, 0],
confint[:, 1],
color='#d62728', alpha=.15)
plt.title('Monthly Bike Trips - Forecast vs Actual')
plt.xlabel('Date')
plt.ylabel('Number of Trips')
plt.legend()
plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(millions_formatter))
plt.savefig('plots/forecast_vs_actual.png')
plt.close()
# Ensure alignment of fc and test_data
test_data = test_data.set_index('datetime')
fc_series = fc_series.reindex(test_data.index)
# Calculate residuals after ensuring alignment
residuals = test_data['nb_passages'] - fc_series
# Print lengths and first few values for debugging
print(f"Length of test_data['nb_passages']: {len(test_data['nb_passages'])}")
print(f"Length of fc_series: {len(fc_series)}")
print(f"First few values of test_data['nb_passages']:\n{test_data['nb_passages'].head()}")
print(f"First few values of fc_series:\n{fc_series.head()}")
print(f"Length of residuals: {len(residuals)}")
# Calculate error metrics
mae = mean_absolute_error(test_data['nb_passages'], fc_series)
rmse = np.sqrt(mean_squared_error(test_data['nb_passages'], fc_series))
mape = np.mean(np.abs((test_data['nb_passages'] - fc_series) / test_data['nb_passages'])) * 100
print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')
print(f'Mean Absolute Percentage Error: {mape}%')
# Plot residuals
plt.figure(figsize=(12,6))
plt.plot(test_data.index, residuals)
plt.title('Residuals')
plt.xlabel('Date')
plt.ylabel('Residual')
plt.savefig('plots/residuals.png')
plt.close()
# Plot residuals distribution
plt.figure(figsize=(12,6))
residuals.hist(bins=20)
plt.title('Distribution of Residuals')
plt.xlabel('Residual')
plt.ylabel('Frequency')
plt.savefig('plots/residuals_distribution.png')
plt.close()