|
Revision 96:2bd4c063e75e, 2.4 kB
(checked in by Stefan Schwarzer <sschwarzer@sschwarzer.net>, 1 year ago)
|
Don't execute anything when imported as script. Tests are now handled
by test.py. |
| Line | |
|---|
| 1 |
|
|---|
| 2 |
|
|---|
| 3 |
|
|---|
| 4 |
|
|---|
| 5 |
|
|---|
| 6 |
|
|---|
| 7 |
|
|---|
| 8 |
|
|---|
| 9 |
|
|---|
| 10 |
|
|---|
| 11 |
|
|---|
| 12 |
|
|---|
| 13 |
|
|---|
| 14 |
|
|---|
| 15 |
|
|---|
| 16 |
|
|---|
| 17 |
|
|---|
| 18 |
|
|---|
| 19 |
|
|---|
| 20 |
|
|---|
| 21 |
|
|---|
| 22 |
|
|---|
| 23 |
|
|---|
| 24 |
""" |
|---|
| 25 |
Tools for converting between unicode strings and byte strings. |
|---|
| 26 |
""" |
|---|
| 27 |
|
|---|
| 28 |
|
|---|
| 29 |
# Codec used by `encode`, and tried first by `decode`.
DEFAULT_ENCODING = "UTF-8"

# Codec `decode` falls back to when a byte string can't be decoded
# with `DEFAULT_ENCODING`.
FALLBACK_ENCODING = "iso-8859-15"
|---|
| 33 |
|
|---|
| 34 |
def encode(unicode_string, encoding=None):
    r"""
    Return a byte string for the given `unicode_string`.

    `encoding` names the codec to use. If it is `None` (the default),
    the module-level `DEFAULT_ENCODING` is used.

    Characters which can't be represented in the chosen encoding don't
    raise an exception; the `'replace'` error handler substitutes the
    codec's replacement marker for them.

    The expected values in the examples below are written as Python 2
    byte string reprs.

    >>> import unicodedata
    >>> encode(u"abc")
    'abc'
    >>> encode(unicodedata.lookup('LATIN SMALL LETTER A WITH DIAERESIS')
    ...        + u"bc")
    '\xc3\xa4bc'
    """
    # Resolve the default at call time, not in the signature, so that
    # rebinding `DEFAULT_ENCODING` at runtime still takes effect.
    if encoding is None:
        encoding = DEFAULT_ENCODING
    return unicode_string.encode(encoding, 'replace')
|---|
| 47 |
|
|---|
| 48 |
def decode(byte_string, encoding=None, fallback_encoding=None):
    r"""
    Return a unicode string for the given `byte_string`.

    The argument is decoded with `encoding` first. If that raises a
    `UnicodeDecodeError`, it is decoded with `fallback_encoding`
    instead. An error raised by the fallback decoding is not caught.

    If `encoding` or `fallback_encoding` is `None` (the default), the
    module-level `DEFAULT_ENCODING` or `FALLBACK_ENCODING`,
    respectively, is used.

    >>> import unicodedata
    >>> s = "\xc3\xa4bc" # UTF-8, umlaut encoded in two bytes
    >>> u = decode(s)
    >>> len(u), unicodedata.name(u[0])
    (3, 'LATIN SMALL LETTER A WITH DIAERESIS')

    >>> s = "\xe4bc" # ISO-8859-15, umlaut encoded in one byte
    >>> u = decode(s)
    >>> len(u), unicodedata.name(u[0])
    (3, 'LATIN SMALL LETTER A WITH DIAERESIS')
    """
    # Resolve the defaults at call time, not in the signature, so that
    # rebinding the module-level constants at runtime still takes effect.
    if encoding is None:
        encoding = DEFAULT_ENCODING
    try:
        return byte_string.decode(encoding)
    except UnicodeDecodeError:
        # The fallback is only looked up when it's actually needed.
        if fallback_encoding is None:
            fallback_encoding = FALLBACK_ENCODING
        return byte_string.decode(fallback_encoding)
|---|
| 70 |
|
|---|