* add forecasting code

* add statsmodel

* sort import

* sort import fix

* fixing black

* sort requirement

* optimize code

* try with limited data

* sort again

* sort fix

* sort fix

* delete warning and black

* add code for forecasting

* use black

* add more hints to describe

* add doctest

* finding whitespace

* fixing doctest

* delete

* revert back

* revert back

* revert back again

* revert back again

* revert back again

* try trimming whitespace

* try adding doctype and etc

* fixing reviews

* deleting all the space

* fixing the build

* delete x

* add description for safety checker

* deleting subscription integer

* fix docthint

* make def to use function parameters and return values

* make def to use function parameters and return values

* type hints on data safety checker

* optimize code

* Update run.py

Co-authored-by: FVFYK3GEHV22 <fvfyk3gehv22@FVFYK3GEHV22s-MacBook-Pro.local>
Co-authored-by: Christian Clauss <cclauss@me.com>
Nandiya 2020-10-24 21:07:27 +07:00 committed by GitHub
parent b97529dd88
commit 12c69800bd
4 changed files with 271 additions and 0 deletions

__init__.py (new, empty file)

ex_data.csv (new file)

@@ -0,0 +1,114 @@
total_user,total_events,days
18231,0.0,1
22621,1.0,2
15675,0.0,3
23583,1.0,4
68351,5.0,5
34338,3.0,6
19238,0.0,0
24192,0.0,1
70349,0.0,2
103510,0.0,3
128355,1.0,4
148484,6.0,5
153489,3.0,6
162667,1.0,0
311430,3.0,1
435663,7.0,2
273526,0.0,3
628588,2.0,4
454989,13.0,5
539040,3.0,6
52974,1.0,0
103451,2.0,1
810020,5.0,2
580982,3.0,3
216515,0.0,4
134694,10.0,5
93563,1.0,6
55432,1.0,0
169634,1.0,1
254908,4.0,2
315285,3.0,3
191764,0.0,4
514284,7.0,5
181214,4.0,6
78459,2.0,0
161620,3.0,1
245610,4.0,2
326722,5.0,3
214578,0.0,4
312365,5.0,5
232454,4.0,6
178368,1.0,0
97152,1.0,1
222813,4.0,2
285852,4.0,3
192149,1.0,4
142241,1.0,5
173011,2.0,6
56488,3.0,0
89572,2.0,1
356082,2.0,2
172799,0.0,3
142300,1.0,4
78432,2.0,5
539023,9.0,6
62389,1.0,0
70247,1.0,1
89229,0.0,2
94583,1.0,3
102455,0.0,4
129270,0.0,5
311409,1.0,6
1837026,0.0,0
361824,0.0,1
111379,2.0,2
76337,2.0,3
96747,0.0,4
92058,0.0,5
81929,2.0,6
143423,0.0,0
82939,0.0,1
74403,1.0,2
68234,0.0,3
94556,1.0,4
80311,0.0,5
75283,3.0,6
77724,0.0,0
49229,2.0,1
65708,2.0,2
273864,1.0,3
1711281,0.0,4
1900253,5.0,5
343071,1.0,6
1551326,0.0,0
56636,1.0,1
272782,2.0,2
1785678,0.0,3
241866,0.0,4
461904,0.0,5
2191901,2.0,6
102925,0.0,0
242778,1.0,1
298608,0.0,2
322458,10.0,3
216027,9.0,4
916052,12.0,5
193278,12.0,6
263207,8.0,0
672948,10.0,1
281909,1.0,2
384562,1.0,3
1027375,2.0,4
828905,9.0,5
624188,22.0,6
392218,8.0,0
292581,10.0,1
299869,12.0,2
769455,20.0,3
316443,8.0,4
1212864,24.0,5
1397338,28.0,6
223249,8.0,0
191264,14.0,1

run.py (new file)

@@ -0,0 +1,156 @@
"""
this is code for forecasting
but i modified it and used it for safety checker of data
for ex: you have a online shop and for some reason some data are
missing (the amount of data that u expected are not supposed to be)
then we can use it
*ps : 1. ofc we can use normal statistic method but in this case
the data is quite absurd and only a little^^
2. ofc u can use this and modified it for forecasting purpose
for the next 3 months sales or something,
u can just adjust it for ur own purpose
"""
import numpy as np
import pandas as pd
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVR
from statsmodels.tsa.statespace.sarimax import SARIMAX


def linear_regression_prediction(
train_dt: list, train_usr: list, train_mtch: list, test_dt: list, test_mtch: list
) -> float:
"""
First method: linear regression
input : training data (date, total_user, total_event) in list of float
output : list of total user prediction in float
>>> linear_regression_prediction([2,3,4,5], [5,3,4,6], [3,1,2,4], [2,1], [2,2])
5.000000000000003
"""
    x = [[1, item, train_mtch[i]] for i, item in enumerate(train_dt)]
    x = np.array(x)
    y = np.array(train_usr)
    beta = np.dot(np.dot(np.linalg.inv(np.dot(x.transpose(), x)), x.transpose()), y)
    return abs(beta[0] + test_dt[0] * beta[1] + test_mtch[0] * beta[2])
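
# Note: the matrix step above is the normal equation, beta = (X^T X)^-1 X^T y.
# For the doctest inputs, beta works out to [2, 0, 1], so the prediction is
# |2 + 2 * 0 + 2 * 1| = 4.0 (up to floating-point error).
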
def sarimax_predictor(train_user: list, train_match: list, test_match: list) -> float:
"""
second method: Sarimax
sarimax is a statistic method which using previous input
and learn its pattern to predict future data
input : training data (total_user, with exog data = total_event) in list of float
output : list of total user prediction in float
>>> sarimax_predictor([4,2,6,8], [3,1,2,4], [2])
6.6666671111109626
"""
order = (1, 2, 1)
seasonal_order = (1, 1, 0, 7)
model = SARIMAX(
train_user, exog=train_match, order=order, seasonal_order=seasonal_order
)
model_fit = model.fit(disp=False, maxiter=600, method="nm")
result = model_fit.predict(1, len(test_match), exog=[test_match])
return result[0]
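
# Note: predict(start, end, exog=...) is used above; fitted SARIMAX results
# in statsmodels also provide model_fit.forecast(steps, exog=...), which can
# read more clearly for purely out-of-sample forecasts.
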
def support_vector_regressor(x_train: list, x_test: list, train_user: list) -> float:
"""
Third method: Support vector regressor
svr is quite the same with svm(support vector machine)
it uses the same principles as the SVM for classification,
with only a few minor differences and the only different is that
it suits better for regression purpose
input : training data (date, total_user, total_event) in list of float
where x = list of set (date and total event)
output : list of total user prediction in float
>>> support_vector_regressor([[5,2],[1,5],[6,2]], [[3,2]], [2,1,4])
1.634932078116079
"""
regressor = SVR(kernel="rbf", C=1, gamma=0.1, epsilon=0.1)
regressor.fit(x_train, train_user)
y_pred = regressor.predict(x_test)
return y_pred[0]
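
# Note: the rbf kernel is sensitive to feature scale; the fixed C / gamma /
# epsilon values above work here because the inputs are row-normalized
# before being passed in (see the Normalizer step below).
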
def interquartile_range_checker(train_user: list) -> float:
"""
Optional method: interquatile range
input : list of total user in float
output : low limit of input in float
this method can be used to check whether some data is outlier or not
>>> interquartile_range_checker([1,2,3,4,5,6,7,8,9,10])
2.8
"""
train_user.sort()
q1 = np.percentile(train_user, 25)
q3 = np.percentile(train_user, 75)
iqr = q3 - q1
low_lim = q1 - (iqr * 0.1)
return low_lim
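
# Note: the 0.1 multiplier gives a deliberately tight lower fence; the
# conventional Tukey fence for outlier detection uses 1.5 * iqr instead.
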
def data_safety_checker(list_vote: list, actual_result: float) -> None:
"""
Used to review all the votes (list result prediction)
and compare it to the actual result.
input : list of predictions
output : print whether it's safe or not
>>> data_safety_checker([2,3,4],5.0)
Today's data is not safe.
"""
    safe = 0
    not_safe = 0
    for i in list_vote:
        if i > actual_result:
            # the prediction is higher than the actual result,
            # i.e. the day's data looks incomplete
            not_safe = not_safe + 1
        elif abs(abs(i) - abs(actual_result)) <= 0.1:
            safe = safe + 1
        else:
            not_safe = not_safe + 1
print(f"Today's data is {'not ' if safe <= not_safe else ''}safe.")
# data_input_df = pd.read_csv("ex_data.csv")
data_input = [[18231, 0.0, 1], [22621, 1.0, 2], [15675, 0.0, 3], [23583, 1.0, 4]]
data_input_df = pd.DataFrame(data_input, columns=["total_user", "total_events", "days"])
"""
data column = total user in a day, how much online event held in one day,
what day is that(sunday-saturday)
"""
# start normalization
normalize_df = Normalizer().fit_transform(data_input_df.values)
# split data
total_date = normalize_df[:, 2].tolist()
total_user = normalize_df[:, 0].tolist()
total_match = normalize_df[:, 1].tolist()
# for svr (input variable = total date and total match)
x = normalize_df[:, [1, 2]].tolist()
x_train = x[: len(x) - 1]
x_test = x[len(x) - 1 :]
# for linear regression & sarimax
trn_date = total_date[: len(total_date) - 1]
trn_user = total_user[: len(total_user) - 1]
trn_match = total_match[: len(total_match) - 1]
tst_date = total_date[len(total_date) - 1 :]
tst_user = total_user[len(total_user) - 1 :]
tst_match = total_match[len(total_match) - 1 :]
# voting system with forecasting
res_vote = []
res_vote.append(
linear_regression_prediction(trn_date, trn_user, trn_match, tst_date, tst_match)
)
res_vote.append(sarimax_predictor(trn_user, trn_match, tst_match))
res_vote.append(support_vector_regressor(x_train, x_test, trn_user))
# check the safety of today's data
data_safety_checker(res_vote, tst_user[0])
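
A minimal sketch (an addition, not part of the file above): wrapping the
script section in a __main__ guard would let the predictor functions be
imported elsewhere without running the whole pipeline, and gives the
doctests an entry point:

if __name__ == "__main__":
    import doctest

    doctest.testmod()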

requirements.txt

@@ -11,6 +11,7 @@ qiskit
requests
scikit-fuzzy
sklearn
statsmodels
sympy
tensorflow
xgboost
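
With the new line in place, a fresh environment picks up statsmodels along
with the other dependencies:

pip install -r requirements.txt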