mirror of
https://github.com/rasbt/python_reference.git
synced 2024-12-18 00:00:19 +00:00
function to process first and last names
This commit is contained in:
parent
731425d794
commit
044d334ef9
82
useful_scripts/preprocess_first_last_names.py
Normal file
82
useful_scripts/preprocess_first_last_names.py
Normal file
|
@ -0,0 +1,82 @@
|
||||||
|
# Sebastian Raschka 2014
|
||||||
|
#
|
||||||
|
# A Python function to generalize first and last names.
|
||||||
|
# The typical use case of such a function is to merge data that have been collected
|
||||||
|
# from different sources (e.g., names of soccer players as shown in the doctest.)
|
||||||
|
#
|
||||||
|
|
||||||
|
import unicodedata
|
||||||
|
import string
|
||||||
|
import re
|
||||||
|
|
||||||
|
def preprocess_names(name: str, output_sep: str = ' ',
                     firstname_output_letters: int = 1) -> str:
    """
    Return a person's name in the normalized, all-lowercase format
        <last_name><separator><firstname letter(s)>

    Two input layouts are recognized: "First [Middle ...] Last" and
    "Last, First [Middle ...]" (the comma marks the second layout).
    Accents are removed and every character that is not an ASCII letter
    (apostrophes, hyphens, digits, ...) is dropped.

    Parameters
    ----------
    name : str
        The raw name. Empty or whitespace-only input yields ''.
    output_sep : str (default: ' ')
        Separator placed between the last name and the first-name letters.
        It is omitted when there are no first-name letters to append.
    firstname_output_letters : int (default: 1)
        Number of leading first-name letters to keep; 0 returns the
        last name only.

    Returns
    -------
    str
        The normalized name; a single-word name is returned lowercased.

    >>> preprocess_names("Samuel Eto'o")
    'etoo s'

    >>> preprocess_names("Eto'o, Samuel")
    'etoo s'

    >>> preprocess_names("Eto'o,Samuel")
    'etoo s'

    >>> preprocess_names('Xavi')
    'xavi'

    >>> preprocess_names('Yaya Touré')
    'toure y'

    >>> preprocess_names('José Ángel Pozo')
    'pozo j'

    >>> preprocess_names('Pozo, José Ángel')
    'pozo j'

    >>> preprocess_names('Pozo, José Ángel', firstname_output_letters=2)
    'pozo jo'

    >>> preprocess_names("Eto'o, Samuel", firstname_output_letters=2)
    'etoo sa'

    >>> preprocess_names("Eto'o, Samuel", firstname_output_letters=0)
    'etoo'

    >>> preprocess_names("Eto'o, Samuel", output_sep=', ')
    'etoo, s'

    >>> preprocess_names("Eto'o, Samuel", output_sep=', ', firstname_output_letters=0)
    'etoo'

    >>> preprocess_names("  Samuel Eto'o")
    'etoo s'

    >>> preprocess_names('')
    ''

    """

    # Names of the regex groups that hold the last and the first name.
    # The default layout is "First ... Last"; a comma flips it.
    last, first = 'last', 'first'
    last_pos = -1

    if ',' in name:
        last, first = first, last
        name = name.replace(',', ' ')
        last_pos = 1

    # Keep only two tokens (middle names are discarded) and always rebuild
    # the string with single spaces, so that leading whitespace, tabs, or an
    # empty input cannot break the pattern match below.
    tokens = name.split()
    if len(tokens) > 2:
        tokens = [tokens[0], tokens[last_pos]]
    name = ' '.join(tokens)

    # Remove accents: NFKD splits accented letters into base letter plus
    # combining mark, and the filter keeps ASCII letters and spaces only.
    allowed = set(string.ascii_letters + ' ')
    name = ''.join(ch for ch in unicodedata.normalize('NFKD', name)
                   if ch in allowed)

    # Get first and last name if applicable (two words are required).
    m = re.match(r'(?P<first>\w+)\W+(?P<last>\w+)', name)
    if m:
        output = m.group(last)
        initials = m.group(first)[:firstname_output_letters]
        # Only append the separator when something follows it; otherwise a
        # non-whitespace separator would be left dangling at the end.
        if initials:
            output = '%s%s%s' % (output, output_sep, initials)
    else:
        output = name
    return output.lower().strip()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Running this file as a script executes the doctests embedded in the
    # docstrings of this module.
    import doctest
    doctest.testmod()
|
Loading…
Reference in New Issue
Block a user