root/coding.py

Revision 96:2bd4c063e75e, 2.4 kB (checked in by Stefan Schwarzer <sschwarzer@sschwarzer.net>, 1 year ago)
Don't execute anything when imported as script. Tests are now handled
by test.py.
Line 
1 # coding: UTF-8
2 # Copyright (C) 2007, Stefan Schwarzer
3 #
4 # Permission is hereby granted, free of charge, to any person
5 # obtaining a copy of this software and associated documentation files
6 # (the "Software"), to deal in the Software without restriction,
7 # including without limitation the rights to use, copy, modify, merge,
8 # publish, distribute, sublicense, and/or sell copies of the Software,
9 # and to permit persons to whom the Software is furnished to do so,
10 # subject to the following conditions:
11 #
12 # The above copyright notice and this permission notice shall be
13 # included in all copies or substantial portions of the Software.
14 #
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 # SOFTWARE.
23
24 """
25 Tools for converting between unicode strings and byte strings.
26 """
27
# Encoding assumed for all file names and file contents; `encode` always
# uses it and `decode` tries it first.
DEFAULT_ENCODING = "UTF-8"

# Encoding used by `decode` when decoding with `DEFAULT_ENCODING` fails.
# It should be an 8-bit encoding.
FALLBACK_ENCODING = "iso-8859-15"
33
def encode(unicode_string):
    r"""
    Convert the given `unicode_string` to a byte string, using the
    `DEFAULT_ENCODING`. The 'replace' error handler is passed to the
    codec, so characters which can't be encoded are substituted
    instead of causing an exception.

    >>> import unicodedata
    >>> encode(u"abc")
    'abc'
    >>> encode(unicodedata.lookup('LATIN SMALL LETTER A WITH DIAERESIS')
    ... + u"bc")
    '\xc3\xa4bc'
    """
    byte_string = unicode_string.encode(DEFAULT_ENCODING, 'replace')
    return byte_string
47
def decode(byte_string):
    r"""
    Convert the given `byte_string` to a unicode string. Decoding is
    tried with the `DEFAULT_ENCODING` first; if that raises a
    `UnicodeDecodeError`, the `FALLBACK_ENCODING` is used instead.

    >>> import unicodedata
    >>> s = "\xc3\xa4bc"  # UTF-8, umlaut encoded in two bytes
    >>> u = decode(s)
    >>> len(u), unicodedata.name(u[0])
    (3, 'LATIN SMALL LETTER A WITH DIAERESIS')

    >>> s = "\xe4bc"  # 8-bit encoding (ISO-8859-1/-15), umlaut is one byte
    >>> u = decode(s)
    >>> len(u), unicodedata.name(u[0])
    (3, 'LATIN SMALL LETTER A WITH DIAERESIS')
    """
    try:
        unicode_string = byte_string.decode(DEFAULT_ENCODING)
    except UnicodeDecodeError:
        # The fallback is meant to be an 8-bit encoding, so this second
        # attempt is expected to succeed for any input.
        unicode_string = byte_string.decode(FALLBACK_ENCODING)
    return unicode_string
70
Note: See TracBrowser for help on using the browser.