# Public domain username sanitizer by Lenny Domnitser, 2007
# Found at http://domnit.org/software
#
# "It's less dumb about Unicode than most of what you see on the web"
#   -- self-proclaimed
#
# This would be a good place to list modifications to the original code:
# * Runs on Python 3 as well as Python 2 (no hard dependency on `unicode`).
# * Unnamed characters and unsupported number names are rejected with the
#   regular "illegal characters" ValueError instead of leaking other errors.

import re
import unicodedata

try:
    _text_type = unicode  # Python 2
except NameError:
    _text_type = str  # Python 3: str is already the Unicode text type

# Captures the first word following DIGIT/NUMBER in a Unicode character
# name, e.g. 'FULLWIDTH DIGIT ONE' -> 'ONE', 'CIRCLED NUMBER TEN' -> 'TEN'.
_NUMBER_NAME_RE = re.compile(r'(?:DIGIT|NUMBER) ([A-Z]+)')

# English number words (as used in Unicode character names) -> ASCII digits.
_NUMBER_WORDS = {
    'ZERO': '0', 'ONE': '1', 'TWO': '2', 'THREE': '3', 'FOUR': '4',
    'FIVE': '5', 'SIX': '6', 'SEVEN': '7', 'EIGHT': '8', 'NINE': '9',
    'TEN': '10', 'ELEVEN': '11', 'TWELVE': '12', 'THIRTEEN': '13',
    'FOURTEEN': '14', 'FIFTEEN': '15', 'SIXTEEN': '16', 'SEVENTEEN': '17',
    'EIGHTEEN': '18', 'NINETEEN': '19', 'TWENTY': '20',
}


def _simple_digits(c, char_name):
    """Return the ASCII digit string for number character *c*.

    *char_name* is the Unicode name of *c*.  Raises ValueError (code 3)
    when the name does not spell out a number found in _NUMBER_WORDS.
    """
    if c in '0123456789':
        return c
    match = _NUMBER_NAME_RE.search(char_name)
    if match is None:
        raise ValueError('illegal username: illegal characters', 3)
    # NOTE: only the first word after DIGIT/NUMBER is inspected, so a
    # compound name such as 'CIRCLED NUMBER TWENTY ONE' maps to '20'.
    try:
        return _NUMBER_WORDS[match.group(1)]
    except KeyError:
        # e.g. 'CIRCLED NUMBER THIRTY': matched, but not in the table.
        raise ValueError('illegal username: illegal characters', 3)


def username(name, min_length=3, max_length=30):
    """username(name) -> (what_you_put_in_the_db, what_you_display)

    Sanitize *name* and return a ``(canonical_name, display_name)`` tuple.

    The display name is *name* converted to text with surrounding
    whitespace stripped.  The canonical name is built from the display
    name by keeping letters, converting number characters to ASCII
    digits, dropping punctuation and spaces, and lower-casing the result.

    name -- the requested username; byte strings are decoded as ASCII,
            any other object is converted with the text type.
    min_length, max_length -- inclusive bounds on the length of the
            stripped display name; a falsy bound disables that check.

    Raises ValueError with ``args == (message, code)``:
      code 1 -- the stripped name is too short or too long
      code 2 -- letters come from more than one script (the script is
                taken to be the first word of the Unicode character name)
      code 3 -- a character is unnamed, is a number that cannot be
                converted, or is not a letter/number/punctuation/space
    """
    if isinstance(name, bytes):
        # Same result as Python 2's unicode(str) with the default codec.
        name = name.decode('ascii')
    else:
        name = _text_type(name)
    name = name.strip()

    length = len(name)
    if (min_length and length < min_length) or \
            (max_length and length > max_length):
        raise ValueError('illegal username: bad length', 1)

    canonical_name = []
    only_script = ''
    for c in name:
        # Passing a default keeps unicodedata.name() from raising its own
        # ValueError('no such name') for unnamed (e.g. control) characters.
        char_name = unicodedata.name(c, '')
        if not char_name:
            raise ValueError('illegal username: illegal characters', 3)
        cat0 = unicodedata.category(c)[0]
        if cat0 == 'L':  # letters
            script = char_name.split(' ', 1)[0]
            if only_script and only_script != script:
                raise ValueError('illegal username: mixed scripts', 2)
            only_script = script
            canonical_name.append(c)
        elif cat0 == 'N':  # numbers
            canonical_name.append(_simple_digits(c, char_name))
        elif cat0 == 'P' or c == ' ':  # punctuation, space
            # Allowed in the display name, omitted from the canonical name.
            pass
        else:
            raise ValueError('illegal username: illegal characters', 3)

    return ''.join(canonical_name).lower(), name