mirror of
https://github.com/rasbt/python_reference.git
synced 2024-11-23 20:11:13 +00:00
function to process first and last names
This commit is contained in:
parent
731425d794
commit
044d334ef9
82
useful_scripts/preprocess_first_last_names.py
Normal file
82
useful_scripts/preprocess_first_last_names.py
Normal file
|
@ -0,0 +1,82 @@
|
|||
# Sebastian Raschka 2014
|
||||
#
|
||||
# A Python function to generalize first and last names.
|
||||
# The typical use case of such a function is to merge data that have been collected
|
||||
# from different sources (e.g., names of soccer players as shown in the doctest.)
|
||||
#
|
||||
|
||||
import unicodedata
|
||||
import string
|
||||
import re
|
||||
|
||||
def preprocess_names(name, output_sep=' ', firstname_output_letters=1):
    """
    Function that outputs a person's name in the format
    <last_name><separator><firstname letter(s)> (all lowercase)

    Accepted input layouts are "First [Middle ...] Last" and, when the
    input contains a comma, "Last, First [Middle ...]". Accents are
    stripped and every character that is not an ASCII letter (apostrophes,
    hyphens, digits, ...) is dropped. Middle names are ignored.

    Parameters
    ----------
    name : str
        The name to normalize.
    output_sep : str
        Text placed between the last name and the first-name letter(s).
        It is lowercased together with the rest of the result.
    firstname_output_letters : int
        Number of leading letters of the first name to keep. With 0 only
        the last name is returned (without a dangling separator).

    Returns
    -------
    str
        The normalized name; a single-word input is returned as that word
        in lowercase, and an input without any ASCII letters yields ''.

    >>> preprocess_names("Samuel Eto'o")
    'etoo s'

    >>> preprocess_names("Eto'o, Samuel")
    'etoo s'

    >>> preprocess_names("Eto'o,Samuel")
    'etoo s'

    >>> preprocess_names('Xavi')
    'xavi'

    >>> preprocess_names('Yaya Touré')
    'toure y'

    >>> preprocess_names('José Ángel Pozo')
    'pozo j'

    >>> preprocess_names('Pozo, José Ángel')
    'pozo j'

    >>> preprocess_names('Pozo, José Ángel', firstname_output_letters=2)
    'pozo jo'

    >>> preprocess_names("Eto'o, Samuel", firstname_output_letters=2)
    'etoo sa'

    >>> preprocess_names("Eto'o, Samuel", firstname_output_letters=0)
    'etoo'

    >>> preprocess_names("Eto'o, Samuel", output_sep=', ')
    'etoo, s'

    >>> preprocess_names("  Samuel Eto'o")
    'etoo s'

    >>> preprocess_names('')
    ''

    """
    # A comma marks the "Last, First" layout; without one the input is
    # read as "First [Middle ...] Last".
    last_name_first = ',' in name

    # Reduce each whitespace/comma-separated token to its ASCII letters.
    # NFKD splits an accented letter into base letter + combining mark, so
    # filtering on ascii_letters removes the accent but keeps the letter.
    # Tokens left empty (e.g. a lone "-") are dropped so they cannot be
    # mistaken for a first or last name.
    tokens = []
    for raw_token in name.replace(',', ' ').split():
        decomposed = unicodedata.normalize('NFKD', raw_token)
        letters = ''.join(ch for ch in decomposed
                          if ch in string.ascii_letters)
        if letters:
            tokens.append(letters)

    # Nothing to reorder: empty input or a single name such as "Xavi".
    if len(tokens) < 2:
        return ''.join(tokens).lower()

    if last_name_first:
        # "Last, First Middle": the surname leads, the given name follows.
        last_name, first_name = tokens[0], tokens[1]
    else:
        # "First Middle Last": the surname is the final token.
        first_name, last_name = tokens[0], tokens[-1]

    first_part = first_name[:firstname_output_letters]
    if not first_part:
        # No first-name letters requested: omit the separator as well.
        return last_name.lower()
    return ('%s%s%s' % (last_name, output_sep, first_part)).lower()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Executed as a script: run the usage examples embedded in the
    # docstrings of this module as a self-test.
    from doctest import testmod

    testmod()
|
Loading…
Reference in New Issue
Block a user