# Sebastian Raschka 2014
#
# A Python function to generalize first and last names.
# The typical use case of such a function is to merge data that have been
# collected from different sources (e.g., names of soccer players as shown
# in the doctest.)
#

import re
import string
import unicodedata

# Characters that survive normalization: plain ASCII letters and spaces.
_ALLOWED_CHARS = frozenset(string.ascii_letters + ' ')

# Two words separated by at least one non-word character. Used with
# ``.match`` so the first word must start at the beginning of the string.
_TWO_WORDS = re.compile(r'(?P<first>\w+)\W+(?P<last>\w+)')


def _strip_accents(text):
    """Decompose accented characters and keep only ASCII letters and spaces."""
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(ch for ch in decomposed if ch in _ALLOWED_CHARS)


def preprocess_names(name, output_sep=' ', firstname_output_letters=1):
    """
    Return a person's name in the normalized, all-lowercase format
    ``<last name><output_sep><leading letter(s) of first name>``.

    Both "First Last" and "Last, First" inputs are accepted; a comma
    anywhere in `name` selects the "Last, First" interpretation. Accents
    are removed and every character that is not an ASCII letter or a
    space (apostrophes, hyphens, digits, ...) is dropped.

    When more than two whitespace-separated tokens are present only two
    are kept: the first and last token for "First ... Last" input, or the
    first and second token for "Last, First ..." input. Multi-word last
    names are therefore reduced to a single token.

    Parameters
    ----------
    name : str
        The raw name.
    output_sep : str
        Text placed between the last name and the first-name letters.
    firstname_output_letters : int
        Number of leading first-name letters to keep. With 0 only the
        last name is returned (no separator is appended).

    Returns
    -------
    str
        The normalized name, lowercased and stripped of surrounding
        whitespace. An empty or whitespace-only `name` yields ''.

    >>> preprocess_names("Samuel Eto'o")
    'etoo s'

    >>> preprocess_names("Eto'o, Samuel")
    'etoo s'

    >>> preprocess_names("Eto'o,Samuel")
    'etoo s'

    >>> preprocess_names('Xavi')
    'xavi'

    >>> preprocess_names('Yaya Touré')
    'toure y'

    >>> preprocess_names('José Ángel Pozo')
    'pozo j'

    >>> preprocess_names('Pozo, José Ángel')
    'pozo j'

    >>> preprocess_names('Pozo, José Ángel', firstname_output_letters=2)
    'pozo jo'

    >>> preprocess_names("Eto'o, Samuel", firstname_output_letters=2)
    'etoo sa'

    >>> preprocess_names("Eto'o, Samuel", firstname_output_letters=0)
    'etoo'

    >>> preprocess_names("Eto'o, Samuel", output_sep=', ')
    'etoo, s'

    >>> preprocess_names("Eto'o, Samuel", output_sep=', ', firstname_output_letters=0)
    'etoo'

    >>> preprocess_names("   Samuel Eto'o")
    'etoo s'

    >>> preprocess_names('')
    ''

    """
    # A comma marks "Last, First" ordering; it is treated as a separator.
    last_name_first = ',' in name
    tokens = name.replace(',', ' ').split()
    if not tokens:
        return ''

    if len(tokens) > 2:
        # Drop middle names: keep the leading token plus either the token
        # right after it (comma form) or the final token (plain form).
        tokens = [tokens[0], tokens[1 if last_name_first else -1]]

    # Rebuilding from tokens also removes leading/trailing whitespace,
    # which would otherwise defeat the start-anchored match below.
    cleaned = _strip_accents(' '.join(tokens))

    match = _TWO_WORDS.match(cleaned)
    if match is None:
        # Single-word names (or names left with one word after cleaning).
        return cleaned.lower().strip()

    given, family = match.group('first'), match.group('last')
    if last_name_first:
        given, family = family, given

    given_part = given[:firstname_output_letters]
    if given_part:
        output = '%s%s%s' % (family, output_sep, given_part)
    else:
        # No first-name letters requested: avoid a dangling separator.
        output = family
    return output.lower().strip()


if __name__ == "__main__":
    import doctest
    doctest.testmod()