From 044d334ef9f6f910f7fe46dd0b54209d41b661c0 Mon Sep 17 00:00:00 2001
From: rasbt <se.raschka@me.com>
Date: Mon, 29 Dec 2014 00:26:25 -0500
Subject: [PATCH] function to process first and last names

---
 useful_scripts/preprocess_first_last_names.py | 82 +++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 useful_scripts/preprocess_first_last_names.py
diff --git a/useful_scripts/preprocess_first_last_names.py b/useful_scripts/preprocess_first_last_names.py
new file mode 100644
index 0000000..07d36f3
--- /dev/null
+++ b/useful_scripts/preprocess_first_last_names.py
@@ -0,0 +1,82 @@
+# Sebastian Raschka 2014
+#
+# A Python function to generalize first and last names.
+# The typical use case of such a function is to merge data that have been collected
+# from different sources (e.g., names of soccer players, as shown in the doctests).
+# 
+
+import unicodedata
+import string
+import re
+
+def preprocess_names(name, output_sep=' ', firstname_output_letters=1):
+    """
+    Return a person's name normalized to the lowercase format
+    <last_name><separator><firstname letter(s)>.
+
+    A name containing a comma is read as "Last, First ..."; any other name
+    is read as "First ... Last". Only two tokens are used (middle names are
+    dropped, multi-word last names are not supported). Accents are stripped
+    and every character that is not an ASCII letter is removed.
+
+    Args:
+        name: The raw name string.
+        output_sep: Text placed between the last name and the first-name
+            letters; omitted when there are no first-name letters to emit.
+        firstname_output_letters: Number of leading first-name letters to
+            keep; 0 yields the last name only.
+
+    Returns:
+        The normalized name; '' if no ASCII letters remain after cleaning.
+
+    >>> preprocess_names("Samuel Eto'o")
+    'etoo s'
+    >>> preprocess_names("Eto'o, Samuel")
+    'etoo s'
+    >>> preprocess_names("Eto'o,Samuel")
+    'etoo s'
+    >>> preprocess_names('Xavi')
+    'xavi'
+    >>> preprocess_names('Yaya Touré')
+    'toure y'
+    >>> preprocess_names('José Ángel Pozo')
+    'pozo j'
+    >>> preprocess_names('Pozo, José Ángel')
+    'pozo j'
+    >>> preprocess_names('Pozo, José Ángel', firstname_output_letters=2)
+    'pozo jo'
+    >>> preprocess_names("Eto'o, Samuel", firstname_output_letters=2)
+    'etoo sa'
+    >>> preprocess_names("Eto'o, Samuel", firstname_output_letters=0)
+    'etoo'
+    >>> preprocess_names("Eto'o, Samuel", output_sep=', ')
+    'etoo, s'
+    >>> preprocess_names("  Samuel Eto'o")
+    'etoo s'
+    >>> preprocess_names('')
+    ''
+    """
+    # A comma marks the "Last, First" order; treat it as a plain separator.
+    last_name_first = ',' in name
+    tokens = name.replace(',', ' ').split()
+    if len(tokens) > 2:
+        # Drop middle names: the second kept token is the one right after the
+        # last name ("Last, First Middle") or the final one ("First Mid Last").
+        tokens = [tokens[0], tokens[1 if last_name_first else -1]]
+
+    # NFKD splits accented letters into base letter + combining mark, so
+    # keeping only ASCII letters removes accents, apostrophes and digits.
+    keep = set(string.ascii_letters)
+    tokens = [''.join(c for c in unicodedata.normalize('NFKD', t) if c in keep) for t in tokens]
+    tokens = [t for t in tokens if t]
+    if len(tokens) < 2:
+        return ''.join(tokens).lower()  # single name, or nothing usable
+
+    last, first = tokens if last_name_first else reversed(tokens)
+    initials = first[:firstname_output_letters]
+    return ('%s%s%s' % (last, output_sep, initials) if initials else last).lower()
+    
+    
+if __name__ == '__main__':
+    from doctest import testmod  # run the examples embedded in the docstrings
+    testmod()
\ No newline at end of file