mirror of
https://github.com/TheAlgorithms/Python.git
synced 2024-11-23 21:11:08 +00:00
Forecast (#3219)
* add forecasting code * add statsmodel * sort import * sort import fix * fixing black * sort requirement * optimize code * try with limited data * sort again * sort fix * sort fix * delete warning and black * add code for forecasting * use black * add more hints to describe * add doctest * finding whitespace * fixing doctest * delete * revert back * revert back * revert back again * revert back again * revert back again * try trimming whitespace * try adding doctypeand etc * fixing reviews * deleting all the space * fixing the build * delete x * add description for safety checker * deleting subscription integer * fix docthint * make def to use function parameters and return values * make def to use function parameters and return values * type hints on data safety checker * optimize code * Update run.py Co-authored-by: FVFYK3GEHV22 <fvfyk3gehv22@FVFYK3GEHV22s-MacBook-Pro.local> Co-authored-by: Christian Clauss <cclauss@me.com>
This commit is contained in:
parent
b97529dd88
commit
12c69800bd
0
machine_learning/forecasting/__init__.py
Normal file
0
machine_learning/forecasting/__init__.py
Normal file
114
machine_learning/forecasting/ex_data.csv
Normal file
114
machine_learning/forecasting/ex_data.csv
Normal file
|
@ -0,0 +1,114 @@
|
|||
total_user,total_events,days
|
||||
18231,0.0,1
|
||||
22621,1.0,2
|
||||
15675,0.0,3
|
||||
23583,1.0,4
|
||||
68351,5.0,5
|
||||
34338,3.0,6
|
||||
19238,0.0,0
|
||||
24192,0.0,1
|
||||
70349,0.0,2
|
||||
103510,0.0,3
|
||||
128355,1.0,4
|
||||
148484,6.0,5
|
||||
153489,3.0,6
|
||||
162667,1.0,0
|
||||
311430,3.0,1
|
||||
435663,7.0,2
|
||||
273526,0.0,3
|
||||
628588,2.0,4
|
||||
454989,13.0,5
|
||||
539040,3.0,6
|
||||
52974,1.0,0
|
||||
103451,2.0,1
|
||||
810020,5.0,2
|
||||
580982,3.0,3
|
||||
216515,0.0,4
|
||||
134694,10.0,5
|
||||
93563,1.0,6
|
||||
55432,1.0,0
|
||||
169634,1.0,1
|
||||
254908,4.0,2
|
||||
315285,3.0,3
|
||||
191764,0.0,4
|
||||
514284,7.0,5
|
||||
181214,4.0,6
|
||||
78459,2.0,0
|
||||
161620,3.0,1
|
||||
245610,4.0,2
|
||||
326722,5.0,3
|
||||
214578,0.0,4
|
||||
312365,5.0,5
|
||||
232454,4.0,6
|
||||
178368,1.0,0
|
||||
97152,1.0,1
|
||||
222813,4.0,2
|
||||
285852,4.0,3
|
||||
192149,1.0,4
|
||||
142241,1.0,5
|
||||
173011,2.0,6
|
||||
56488,3.0,0
|
||||
89572,2.0,1
|
||||
356082,2.0,2
|
||||
172799,0.0,3
|
||||
142300,1.0,4
|
||||
78432,2.0,5
|
||||
539023,9.0,6
|
||||
62389,1.0,0
|
||||
70247,1.0,1
|
||||
89229,0.0,2
|
||||
94583,1.0,3
|
||||
102455,0.0,4
|
||||
129270,0.0,5
|
||||
311409,1.0,6
|
||||
1837026,0.0,0
|
||||
361824,0.0,1
|
||||
111379,2.0,2
|
||||
76337,2.0,3
|
||||
96747,0.0,4
|
||||
92058,0.0,5
|
||||
81929,2.0,6
|
||||
143423,0.0,0
|
||||
82939,0.0,1
|
||||
74403,1.0,2
|
||||
68234,0.0,3
|
||||
94556,1.0,4
|
||||
80311,0.0,5
|
||||
75283,3.0,6
|
||||
77724,0.0,0
|
||||
49229,2.0,1
|
||||
65708,2.0,2
|
||||
273864,1.0,3
|
||||
1711281,0.0,4
|
||||
1900253,5.0,5
|
||||
343071,1.0,6
|
||||
1551326,0.0,0
|
||||
56636,1.0,1
|
||||
272782,2.0,2
|
||||
1785678,0.0,3
|
||||
241866,0.0,4
|
||||
461904,0.0,5
|
||||
2191901,2.0,6
|
||||
102925,0.0,0
|
||||
242778,1.0,1
|
||||
298608,0.0,2
|
||||
322458,10.0,3
|
||||
216027,9.0,4
|
||||
916052,12.0,5
|
||||
193278,12.0,6
|
||||
263207,8.0,0
|
||||
672948,10.0,1
|
||||
281909,1.0,2
|
||||
384562,1.0,3
|
||||
1027375,2.0,4
|
||||
828905,9.0,5
|
||||
624188,22.0,6
|
||||
392218,8.0,0
|
||||
292581,10.0,1
|
||||
299869,12.0,2
|
||||
769455,20.0,3
|
||||
316443,8.0,4
|
||||
1212864,24.0,5
|
||||
1397338,28.0,6
|
||||
223249,8.0,0
|
||||
191264,14.0,1
|
|
156
machine_learning/forecasting/run.py
Normal file
156
machine_learning/forecasting/run.py
Normal file
|
@ -0,0 +1,156 @@
|
|||
"""
|
||||
This is code for forecasting,
but I modified it and used it as a safety checker for data.
For example: you have an online shop and, for some reason, some data are
missing (the amount of data is not what you expected it to be);
then we can use it.
*ps : 1. Of course we could use a normal statistical method, but in this case
         the data is quite absurd and there is only a little of it.
      2. Of course you can use this and modify it for forecasting purposes,
         e.g. for the next 3 months of sales or something similar;
         you can just adjust it for your own purpose.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.preprocessing import Normalizer
|
||||
from sklearn.svm import SVR
|
||||
from statsmodels.tsa.statespace.sarimax import SARIMAX
|
||||
|
||||
|
||||
def linear_regression_prediction(
    train_dt: list, train_usr: list, train_mtch: list, test_dt: list, test_mtch: list
) -> float:
    """
    First method: linear regression (ordinary least squares).

    Fits ``total_user = b0 + b1 * date + b2 * total_event`` on the training
    data through the normal equations and evaluates the fitted model on the
    first test sample.

    input : train_dt   - training dates
            train_usr  - training total users (the regression target)
            train_mtch - training total events
            test_dt    - test dates (only the first element is used)
            test_mtch  - test total events (only the first element is used)
    output : absolute value of the predicted total user, as a float
    raises : numpy.linalg.LinAlgError if X^T X is singular

    >>> n = linear_regression_prediction([2,3,4,5], [5,3,4,6], [3,1,2,4], [2,1], [2,2])
    >>> abs(n - 4.0) < 1e-6  # tolerance absorbs floating point round-off
    True
    """
    # Design matrix: a constant column for the intercept, then date and event.
    x = np.array([[1, date, match] for date, match in zip(train_dt, train_mtch)])
    y = np.array(train_usr)
    x_t = x.transpose()
    # Normal equations: beta = (X^T X)^-1 X^T y
    beta = np.dot(np.dot(np.linalg.inv(np.dot(x_t, x)), x_t), y)
    # Every feature is multiplied by its own coefficient; the event term used
    # to be *added* to beta[2], which is not a linear-model prediction.
    prediction = beta[0] + test_dt[0] * beta[1] + test_mtch[0] * beta[2]
    return float(abs(prediction))
|
||||
|
||||
|
||||
def sarimax_predictor(train_user: list, train_match: list, test_match: list) -> float:
    """
    Second method: SARIMAX
    SARIMAX is a statistical time-series model that learns the pattern of
    earlier observations in order to predict later ones.
    input : training data (total_user, with exog data = total_event) in list of float,
            plus the exog value(s) of the test sample
    output : the first predicted total user value
    >>> sarimax_predictor([4,2,6,8], [3,1,2,4], [2])
    6.6666671111109626
    """
    sarimax_model = SARIMAX(
        train_user,
        exog=train_match,
        order=(1, 2, 1),
        seasonal_order=(1, 1, 0, 7),
    )
    fitted = sarimax_model.fit(disp=False, maxiter=600, method="nm")
    # Positions 1 .. len(test_match) are requested from the fitted model.
    # NOTE(review): these look like in-sample positions rather than an
    # out-of-sample forecast - confirm against the statsmodels documentation.
    predicted = fitted.predict(1, len(test_match), exog=[test_match])
    return predicted[0]
|
||||
|
||||
|
||||
def support_vector_regressor(x_train: list, x_test: list, train_user: list) -> float:
    """
    Third method: Support vector regressor
    SVR follows the same principles as the SVM (support vector machine)
    used for classification, with a few minor differences that make it
    better suited to regression.
    input : x_train    - training features, a list of [date, total_event] pairs
            x_test     - test features in the same layout
            train_user - training total users (the regression target)
    output : the total user prediction for the first test sample
    >>> support_vector_regressor([[5,2],[1,5],[6,2]], [[3,2]], [2,1,4])
    1.634932078116079
    """
    svr_model = SVR(kernel="rbf", C=1, gamma=0.1, epsilon=0.1)
    svr_model.fit(x_train, train_user)
    return svr_model.predict(x_test)[0]
|
||||
|
||||
|
||||
def interquartile_range_checker(train_user: list) -> float:
    """
    Optional method: interquartile range
    input : list of total user in float
    output : low limit of input in float, computed as q1 - 0.1 * (q3 - q1)
    This method can be used to check whether some data is an outlier or not.
    The input list is left untouched.
    >>> interquartile_range_checker([1,2,3,4,5,6,7,8,9,10])
    2.8
    """
    # np.percentile does not need pre-sorted data, so the caller's list is no
    # longer sorted in place (that was a hidden side effect on the argument).
    q1 = np.percentile(train_user, 25)
    q3 = np.percentile(train_user, 75)
    iqr = q3 - q1
    low_lim = q1 - (iqr * 0.1)
    # Return a plain Python float so the result prints as ``2.8`` whatever
    # repr the installed NumPy version uses for its scalar types.
    return float(low_lim)
|
||||
|
||||
|
||||
def data_safety_checker(list_vote: list, actual_result: float) -> None:
    """
    Used to review all the votes (list result prediction)
    and compare them to the actual result.
    A vote counts as "safe" when the prediction is greater than the actual
    result or within 0.1 of it (compared on absolute values); otherwise it
    counts as "not safe". The data is reported safe only when the safe votes
    strictly outnumber the not-safe votes.
    input : list_vote     - list of predictions
            actual_result - the observed value
    output : print whether it's safe or not
    >>> data_safety_checker([2,3,4],5.0)
    Today's data is not safe.
    """
    safe = 0
    not_safe = 0
    for vote in list_vote:
        # Both conditions increment the *safe* counter; the first one used to
        # overwrite it with ``not_safe + 1`` and corrupted the tally.
        if vote > actual_result or abs(abs(vote) - abs(actual_result)) <= 0.1:
            safe += 1
        else:
            not_safe += 1
    print(f"Today's data is {'not ' if safe <= not_safe else ''}safe.")
|
||||
|
||||
|
||||
# The demo is guarded so that importing this module (for example to run the
# doctests above) does not fit any model or print anything.
if __name__ == "__main__":
    # data_input_df = pd.read_csv("ex_data.csv", header=None)
    data_input = [[18231, 0.0, 1], [22621, 1.0, 2], [15675, 0.0, 3], [23583, 1.0, 4]]
    data_input_df = pd.DataFrame(
        data_input, columns=["total_user", "total_even", "days"]
    )

    # data columns = total users in a day, how many online events were held
    # that day, and which day it is (sunday-saturday)

    # start normalization
    normalize_df = Normalizer().fit_transform(data_input_df.values)

    # split data by column
    total_date = normalize_df[:, 2].tolist()
    total_user = normalize_df[:, 0].tolist()
    total_match = normalize_df[:, 1].tolist()

    # for svr (input variables = total match and total date)
    x = normalize_df[:, [1, 2]].tolist()
    x_train = x[:-1]
    x_test = x[-1:]

    # for linear regression & sarimax: every row but the last is training
    # data, the last row is the sample to check
    trn_date = total_date[:-1]
    trn_user = total_user[:-1]
    trn_match = total_match[:-1]

    tst_date = total_date[-1:]
    tst_user = total_user[-1:]
    tst_match = total_match[-1:]

    # voting system with forecasting: one prediction per method
    res_vote = [
        linear_regression_prediction(
            trn_date, trn_user, trn_match, tst_date, tst_match
        ),
        sarimax_predictor(trn_user, trn_match, tst_match),
        support_vector_regressor(x_train, x_test, trn_user),
    ]

    # check the safety of today's data; tst_user is a one-element list while
    # data_safety_checker expects the actual value itself
    data_safety_checker(res_vote, tst_user[0])
|
|
@ -11,6 +11,7 @@ qiskit
|
|||
requests
|
||||
scikit-fuzzy
|
||||
sklearn
|
||||
statsmodels
|
||||
sympy
|
||||
tensorflow
|
||||
xgboost
|
||||
|
|
Loading…
Reference in New Issue
Block a user