# Examples for using Python's regular expression module "re"
# sr 11/30/2013

import re

# OVERVIEW
#
# Quantifiers (they apply to the element directly in front of them):
#   '*'       0 or more repetitions, e.g. '.*' matches any run of characters
#   '+'       1 or more repetitions
#   '?'       0 or 1 repetition (makes the previous element optional)
#   '{4}'     exactly 4 repetitions
#   '{2,4}'   between 2 and 4 repetitions (note: a comma, not a dash)
#
# Character sets and classes:
#   '[0-9]'   any single character in the range 0 to 9
#   '[A-Z]'   any single character in the range A to Z
#   '\d'      any digit, e.g. '4', '9'
#   '\D'      any NON-digit character
#   '\s'      any whitespace character: ' ', '\t', '\r', '\n', ...
#   '\S'      any NON-whitespace character
#   '\w'      any word character (letters, digits and the underscore)
#   '\W'      any NON-word character
#
# Anchors:
#   '^bla'    matches 'bla' only at the start of the string
#             (negation is written '[^...]', with '^' first inside a set)
#   'let$'    matches 'let' only at the end of the string, so not in 'letter'
#   '\b'      matches the boundary between a word and a non-word character

SEPARATOR = 50 * '-'

# One sample record per line; every example below is run against all of them.
data = '''2013-01-01
2012-02-02
aaaa-02-02
aa-02-02
-04-04
2000 02-02
ghi
stu
2012-03-03'''.strip().split('\n')


def find_groups(pattern, lines):
    """Collect capture groups 1-3 from every line that `pattern` matches.

    Args:
        pattern: Regular expression with at least three capture groups.
        lines: Iterable of strings to search.

    Returns:
        A list with one 3-tuple of strings per matching line, in input
        order.  Lines without a match are skipped.

    Raises:
        re.error: If `pattern` is not a valid regular expression.
        IndexError: If a line matches but `pattern` has fewer than three
            capture groups.
    """
    compiled = re.compile(pattern)  # compile once, reuse for every line
    return [m.group(1, 2, 3) for m in map(compiled.search, lines) if m]


def run_example(label, pattern, lines):
    """Print a banner for `label`, then the groups of each matching line.

    Each matching line is printed as its three captured groups separated by
    single spaces, exactly as ``print(g1, g2, g3)`` would.
    """
    print(SEPARATOR + '\n' + label + '\n' + SEPARATOR)
    for first, second, third in find_groups(pattern, lines):
        print(first, second, third)


# A >> '*' lets the previous element repeat 0 or more times.
# The parentheses create the capture groups that get printed.
run_example('A', r'(.*)-(..)-(..)', data)
# Output after the banner:
#   2013 01 01
#   2012 02 02
#   aaaa 02 02
#   aa 02 02
#    04 04          <- group 1 is empty for '-04-04'
#   2012 03 03

# B >> '+' requires the previous element at least once,
# so '-04-04' (nothing in front of the first dash) no longer matches.
run_example('B', r'(.+)-(..)-(..)', data)
# Output after the banner:
#   2013 01 01
#   2012 02 02
#   aaaa 02 02
#   aa 02 02
#   2012 03 03

# C >> '?' makes the previous element optional.
# With the first dash optional, the greedy '.+' swallows it (and the
# trailing space of '2000 '), which is why group 1 changes below.
run_example('C', r'(.+)-?(..)-(..)', data)
# Output after the banner:
#   2013- 01 01
#   2012- 02 02
#   aaaa- 02 02
#   aa- 02 02
#   - 04 04
#   2000  02 02     <- group 1 is '2000 ' including the space
#   2012- 03 03

# D >> '{4}' requires the previous element exactly 4 times.
run_example('D', r'(a{4})-(..)-(..)', data)
# Output after the banner:
#   aaaa 02 02

# E >> '{2,4}' requires the previous element 2 to 4 times.
run_example('E', r'(a{2,4})-(..)-(..)', data)
# Output after the banner:
#   aaaa 02 02
#   aa 02 02