| # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep) |
| |
| import stringprep, re, codecs |
| from unicodedata import ucd_3_2_0 as unicodedata |
| |
# IDNA section 3.1: the characters that are accepted as label
# separators (U+002E, U+3002, U+FF0E and U+FF61).
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix, once as bytes (for encoded labels)
# and once as str (for labels that are still text).
ace_prefix = b"xn--"
sace_prefix = "xn--"
| |
| # This assumes query strings, so AllowUnassigned is true |
def nameprep(label):  # type: (str) -> str
    """Prepare a single label following the Nameprep steps.

    The steps run in this order: map, normalize (NFKC), reject
    prohibited characters, check the bidirectional requirements.
    Unassigned code points are not checked (query-string behaviour,
    AllowUnassigned is true).

    Returns the prepared label.  Raises UnicodeEncodeError, carrying the
    position of the offending character, if a prohibited character is
    present or a bidi requirement is violated.
    """
    # Map: characters of table B.1 map to nothing, everything else is
    # passed through the table B.2 mapping.
    mapped = "".join(
        stringprep.map_table_b2(ch)
        for ch in label
        if not stringprep.in_table_b1(ch)
    )

    # Normalize
    prepared = unicodedata.normalize("NFKC", mapped)

    # Prohibit
    prohibited_tables = (
        stringprep.in_table_c12,
        stringprep.in_table_c22,
        stringprep.in_table_c3,
        stringprep.in_table_c4,
        stringprep.in_table_c5,
        stringprep.in_table_c6,
        stringprep.in_table_c7,
        stringprep.in_table_c8,
        stringprep.in_table_c9,
    )
    for pos, ch in enumerate(prepared):
        if any(in_table(ch) for in_table in prohibited_tables):
            raise UnicodeEncodeError(
                "idna", prepared, pos, pos + 1, f"Invalid character {ch!r}")

    # Check bidi
    is_randal = [stringprep.in_table_d1(ch) for ch in prepared]
    if not any(is_randal):
        # No RandALCat character at all: nothing further to verify.
        return prepared

    # There is a RandAL char in the string. Further requirements apply:
    # 1) The characters in section 5.8 MUST be prohibited.
    #    This is table C.8, which was already checked above.
    # 2) The string MUST NOT contain any LCat character.
    for pos, ch in enumerate(prepared):
        if stringprep.in_table_d2(ch):
            raise UnicodeEncodeError("idna", prepared, pos, pos + 1,
                                     "Violation of BIDI requirement 2")
    # 3) A RandALCat character MUST be both the first and the last
    #    character of the string.
    if not is_randal[0]:
        raise UnicodeEncodeError("idna", prepared, 0, 1,
                                 "Violation of BIDI requirement 3")
    if not is_randal[-1]:
        last = len(prepared) - 1
        raise UnicodeEncodeError("idna", prepared, last, last + 1,
                                 "Violation of BIDI requirement 3")

    return prepared
| |
def ToASCII(label):  # type: (str) -> bytes
    """Convert one label to its ASCII form (IDNA ToASCII operation).

    A label that is already ASCII is returned encoded as-is.  Any other
    label is run through nameprep(); if the result is still not ASCII it
    is punycode-encoded and the ACE prefix is prepended.

    Raises UnicodeEncodeError if the label is empty, if the result is
    64 bytes or longer, or if a non-ASCII label already starts with the
    ACE prefix.  UseSTD3ASCIIRules is false, so steps 3 is a no-op.
    """
    # Step 1: try ASCII
    try:
        encoded = label.encode("ascii")
    except UnicodeEncodeError:
        encoded = None
    if encoded is not None:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # skip to step 8.
        if not label:
            raise UnicodeEncodeError("idna", label, 0, 1, "label empty")
        if len(encoded) >= 64:
            raise UnicodeEncodeError(
                "idna", label, 0, len(label), "label too long")
        return encoded

    # Step 2: nameprep
    prepared = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        encoded = prepared.encode("ascii")
    except UnicodeEncodeError:
        encoded = None
    if encoded is not None:
        # Skip to step 8.
        if not prepared:
            raise UnicodeEncodeError("idna", prepared, 0, 1, "label empty")
        if len(prepared) >= 64:
            raise UnicodeEncodeError(
                "idna", prepared, 0, len(prepared), "label too long")
        return encoded

    # Step 5: Check ACE prefix
    if prepared.lower().startswith(sace_prefix):
        raise UnicodeEncodeError(
            "idna", prepared, 0, len(sace_prefix),
            "Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    # Step 7: Prepend ACE prefix
    encoded = ace_prefix + prepared.encode("punycode")

    # Step 8: Check size
    # No check for empty is needed as the ACE prefix was prepended.
    if len(encoded) >= 64:
        raise UnicodeEncodeError(
            "idna", prepared, 0, len(prepared), "label too long")
    return encoded
| |
def ToUnicode(label):
    """Convert one label to its Unicode form (IDNA ToUnicode operation).

    *label* may be str or bytes.  A label without the ACE prefix is
    returned as a str unchanged (after nameprep for non-ASCII text).
    A label with the ACE prefix is punycode-decoded and the result is
    verified by re-encoding it with ToASCII().

    Raises UnicodeDecodeError if the label is longer than 1024 items,
    if punycode decoding fails, or if the decoded label does not
    round-trip.  Raises UnicodeEncodeError if a non-ASCII label is
    still not ASCII after nameprep.
    """
    if len(label) > 1024:
        # Protection from https://github.com/python/cpython/issues/98433.
        # https://datatracker.ietf.org/doc/html/rfc5894#section-6
        # doesn't specify a label size limit prior to NAMEPREP. But having
        # one makes practical sense.
        # This leaves ample room for nameprep() to remove Nothing characters
        # per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still
        # preventing us from wasting time decoding a big thing that'll just
        # hit the actual <= 63 length limit in Step 6.
        if isinstance(label, str):
            oversized = label.encode("utf-8", errors="backslashreplace")
        else:
            oversized = label
        raise UnicodeDecodeError(
            "idna", oversized, 0, len(oversized), "label way too long")

    # Step 1: Check for ASCII.  Bytes input is taken to be ASCII already.
    needs_nameprep = False
    if not isinstance(label, bytes):
        try:
            label = label.encode("ascii")
        except UnicodeEncodeError:
            needs_nameprep = True

    if needs_nameprep:
        assert isinstance(label, str)
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeEncodeError as exc:
            raise UnicodeEncodeError("idna", label, exc.start, exc.end,
                                     "Invalid character in IDN label")

    # Step 3: Check for ACE prefix
    assert isinstance(label, bytes)
    if not label.lower().startswith(ace_prefix):
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    prefix_len = len(ace_prefix)
    payload = label[prefix_len:]

    # Step 5: Decode using PUNYCODE
    try:
        decoded = payload.decode("punycode")
    except UnicodeDecodeError as exc:
        # Report positions relative to the full label, prefix included.
        raise UnicodeDecodeError(
            "idna", label, prefix_len + exc.start, prefix_len + exc.end,
            exc.reason)

    # Step 6: Apply ToASCII
    reencoded = ToASCII(decoded)

    # Step 7: Compare the result of step 6 with the one of step 3
    # reencoded will already be in lower case.
    if str(label, "ascii").lower() != str(reencoded, "ascii"):
        raise UnicodeDecodeError(
            "idna", label, 0, len(label),
            f"IDNA does not round-trip, '{label!r}' != '{reencoded!r}'")

    # Step 8: return the result of step 5
    return decoded
| |
| ### Codec APIs |
| |
class Codec(codecs.Codec):
    """Stateless IDNA codec working on whole, dot-separated names."""

    def encode(self, input, errors='strict'):
        """Encode the name *input* (str) label by label.

        Returns a ``(bytes, len(input))`` tuple.  Only ``errors='strict'``
        is accepted; anything else raises UnicodeError.  A label that
        cannot be encoded raises UnicodeEncodeError with positions
        relative to *input*.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError(f"Unsupported error handling: {errors}")

        if not input:
            return b'', 0

        try:
            ascii_name = input.encode('ascii')
        except UnicodeEncodeError:
            ascii_name = None

        if ascii_name is not None:
            # ASCII name: fast path, only the label lengths are checked.
            pieces = ascii_name.split(b'.')
            # Start position of every label inside the name.
            starts = []
            position = 0
            for piece in pieces:
                starts.append(position)
                position += len(piece) + 1
            # An empty label is only tolerated in last position
            # (trailing dot).
            for start, piece in zip(starts[:-1], pieces[:-1]):
                if not piece:
                    raise UnicodeEncodeError("idna", input, start, start+1,
                                             "label empty")
            for start, piece in zip(starts, pieces):
                if len(piece) >= 64:
                    raise UnicodeEncodeError("idna", input,
                                             start, start+len(piece),
                                             "label too long")
            return ascii_name, len(input)

        labels = dots.split(input)
        trailing_dot = b''
        if labels and not labels[-1]:
            trailing_dot = b'.'
            labels.pop()

        encoded = bytearray()
        offset = 0  # start of the current label inside input
        for label in labels:
            if encoded:
                # Join with U+002E
                encoded += b'.'
            try:
                encoded += ToASCII(label)
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                raise UnicodeEncodeError(
                    "idna",
                    input,
                    offset + exc.start,
                    offset + exc.end,
                    exc.reason,
                )
            offset += len(label) + 1
        return bytes(encoded) + trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode the name *input* label by label.

        Returns a ``(str, len(input))`` tuple.  Only ``errors='strict'``
        is accepted; anything else raises UnicodeError.  A label that
        cannot be decoded raises UnicodeDecodeError with positions
        relative to *input*.
        """
        if errors != 'strict':
            raise UnicodeError(f"Unsupported error handling: {errors}")

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if not isinstance(input, bytes):
            # XXX obviously wrong, see #3232
            input = bytes(input)

        if ace_prefix not in input.lower():
            # Fast path: without any ACE prefix plain ASCII decoding is
            # enough; non-ASCII input falls through to the slow path.
            try:
                return input.decode('ascii'), len(input)
            except UnicodeDecodeError:
                pass

        labels = input.split(b".")
        trailing_dot = ''
        if labels and not labels[-1]:
            trailing_dot = '.'
            labels.pop()

        decoded = []
        offset = 0  # start of the current label inside input
        for label in labels:
            try:
                u_label = ToUnicode(label)
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                raise UnicodeDecodeError(
                    "idna", input, offset+exc.start, offset+exc.end,
                    exc.reason)
            else:
                decoded.append(u_label)
            offset += len(label) + 1

        return ".".join(decoded) + trailing_dot, len(input)
| |
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that emits only complete labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode as many complete labels of *input* as possible.

        Returns ``(encoded_bytes, consumed)`` where *consumed* is the
        number of characters of *input* that were used.  Unless *final*
        is true, the last (possibly unfinished) label is left
        unconsumed.  Only ``errors='strict'`` is accepted.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError(f"Unsupported error handling: {errors}")

        if not input:
            return (b'', 0)

        pending = dots.split(input)
        suffix = b''
        if pending:
            if not pending[-1]:
                # Input ends with a separator.
                suffix = b'.'
                pending.pop()
            elif not final:
                # Keep potentially unfinished label until the next call
                pending.pop()
                if pending:
                    suffix = b'.'

        encoded = bytearray()
        consumed = 0
        for label in pending:
            if consumed:
                # Join with U+002E
                encoded += b'.'
                consumed += 1
            try:
                encoded += ToASCII(label)
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                raise UnicodeEncodeError(
                    "idna",
                    input,
                    consumed + exc.start,
                    consumed + exc.end,
                    exc.reason,
                )
            consumed += len(label)

        encoded += suffix
        consumed += len(suffix)
        return (bytes(encoded), consumed)
| |
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that emits only complete labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode as many complete labels of *input* as possible.

        *input* may be str or an ASCII bytes-like object.  Returns
        ``(decoded_str, consumed)`` where *consumed* is the number of
        items of *input* that were used.  Unless *final* is true, the
        last (possibly unfinished) label is left unconsumed.

        Raises UnicodeError for any *errors* value other than
        ``'strict'`` and UnicodeDecodeError when the input is not ASCII
        or a label cannot be decoded.
        """
        if errors != 'strict':
            # The f-prefix is required so that the offending handler
            # name, not the literal "{errors}", ends up in the message.
            raise UnicodeError(f"Unsupported error handling: {errors}")

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            try:
                input = str(input, "ascii")
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                raise UnicodeDecodeError("idna", input,
                                         exc.start, exc.end, exc.reason)
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for index, label in enumerate(labels):
            if index:
                # Account for the separator in front of this label
                # *before* decoding it, so that error positions point
                # into the label itself.  Keying on the index rather
                # than on ``size`` also counts the separator that
                # follows an empty first label.
                size += 1
            try:
                u_label = ToUnicode(label)
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                raise UnicodeDecodeError(
                    "idna",
                    input.encode("ascii", errors="backslashreplace"),
                    size + exc.start,
                    size + exc.end,
                    exc.reason,
                )
            else:
                result.append(u_label)
            size += len(label)

        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
| |
class StreamWriter(Codec,codecs.StreamWriter):
    # Stream writer for this codec: the conversion methods come from
    # Codec, the stream handling from codecs.StreamWriter.
    pass
| |
class StreamReader(Codec,codecs.StreamReader):
    # Stream reader for this codec: the conversion methods come from
    # Codec, the stream handling from codecs.StreamReader.
    pass
| |
| ### encodings module API |
| |
def getregentry():
    """Return the codecs.CodecInfo record describing the 'idna' codec.

    The stateless encode and decode entries are bound methods of two
    separate Codec instances; the incremental and stream entries are
    the classes themselves.
    """
    stateless_encode = Codec().encode
    stateless_decode = Codec().decode
    return codecs.CodecInfo(
        name='idna',
        encode=stateless_encode,
        decode=stateless_decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )