2020-08-06 15:50:23 +00:00
|
|
|
"""
|
2020-08-04 20:11:07 +00:00
|
|
|
developed by: markmelnic
|
|
|
|
original repo: https://github.com/markmelnic/Scoring-Algorithm
|
|
|
|
|
|
|
|
Analyse data using a range-based percentual proximity algorithm
|
|
|
|
and calculate the linear maximum likelihood estimation.
|
|
|
|
The basic principle is that all values supplied will be broken
|
|
|
|
down to a range from 0 to 1 and each column's score will be added
|
|
|
|
up to get the total score.
|
|
|
|
|
|
|
|
==========
|
|
|
|
Example for data of vehicles
|
|
|
|
price|mileage|registration_year
|
|
|
|
20k |60k |2012
|
|
|
|
22k |50k |2011
|
|
|
|
23k |90k |2015
|
|
|
|
16k |210k |2010
|
|
|
|
|
|
|
|
We want the vehicle with the lowest price,
|
|
|
|
lowest mileage but newest registration year.
|
|
|
|
Thus the weights for each column are as follows:
|
|
|
|
[0, 0, 1]
|
2020-08-06 15:50:23 +00:00
|
|
|
"""
|
2020-08-04 20:11:07 +00:00
|
|
|
|
|
|
|
|
2021-10-29 05:21:16 +00:00
|
|
|
def procentual_proximity(
    source_data: list[list[float]], weights: list[int]
) -> list[list[float]]:
    """
    Score every row of ``source_data`` and append the score to that row.

    Each column is rescaled to the range 0..1 using the column's own minimum
    and maximum, and a row's total score is the sum of its per-column scores.

    weights - int list, one entry per column
    possible values - 0 / 1
    0 if lower values have higher weight in the data set
    1 if higher values have higher weight in the data set

    Columns and weights are paired positionally; a column without a matching
    weight (or a weight without a matching column) is ignored.

    The rows of ``source_data`` are modified in place and the very same list
    is returned. Empty input is returned unchanged.

    Raises ValueError if a weight that is paired with a column is not 0 or 1.

    >>> procentual_proximity([[20, 60, 2012],[23, 90, 2015],[22, 50, 2011]], [0, 0, 1])
    [[20, 60, 2012, 2.0], [23, 90, 2015, 1.0], [22, 50, 2011, 1.3333333333333335]]
    >>> procentual_proximity([], [0, 1])
    []
    """

    # Transpose the rows into per-column lists of floats.
    columns: list[list[float]] = []
    for row in source_data:
        for index, value in enumerate(row):
            if index == len(columns):
                columns.append([])
            columns[index].append(float(value))

    # Calculate the scores of each column.
    score_lists: list[list[float]] = []
    for column, weight in zip(columns, weights):
        lowest = min(column)
        span = max(column) - lowest

        if weight == 0:
            # Lower is better: score is 1 - normalised position.
            # A constant column (span of 0) cannot be normalised, so every
            # item gets the best score instead of dividing by zero.
            if span == 0:
                score: list[float] = [1] * len(column)
            else:
                score = [1 - ((item - lowest) / span) for item in column]
        elif weight == 1:
            # Higher is better: score is the normalised position.
            # A constant column contributes nothing to any row.
            if span == 0:
                score = [0] * len(column)
            else:
                score = [(item - lowest) / span for item in column]
        else:
            # weight not 0 or 1
            raise ValueError(f"Invalid weight of {weight:f} provided")

        score_lists.append(score)

    # One running total per row. Sizing by the number of rows (rather than by
    # the first score list) keeps empty data or empty weights from failing.
    final_scores: list[float] = [0] * len(source_data)

    # Accumulate column by column, left to right, so floating point results
    # stay identical to plain repeated addition.
    for score in score_lists:
        for position, value in enumerate(score):
            final_scores[position] = final_scores[position] + value

    # Append each total to its row of the caller's data.
    for row, total in zip(source_data, final_scores):
        row.append(total)

    return source_data
|