| # -*- coding: utf-8 -*- |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or |
| # implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| """Module for the regular expressions crafted from ABNF.""" |
| |
| import sys |
| |
# Character classes from RFC 3986, section 2.2 (reserved characters).
# https://tools.ietf.org/html/rfc3986#page-13
GEN_DELIMS = GENERIC_DELIMITERS = ":/?#[]@"
GENERIC_DELIMITERS_SET = set(GENERIC_DELIMITERS)
# https://tools.ietf.org/html/rfc3986#page-13
SUB_DELIMS = SUB_DELIMITERS = "!$&'()*+,;="
SUB_DELIMITERS_SET = set(SUB_DELIMITERS)
# Escape the '*' for use in regular expressions. This string is meant to be
# spliced inside a ``[...]`` character class, not used as a pattern itself.
SUB_DELIMITERS_RE = r"!$&'()\*+,;="
# reserved = gen-delims / sub-delims
RESERVED_CHARS_SET = GENERIC_DELIMITERS_SET.union(SUB_DELIMITERS_SET)
ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
DIGIT = '0123456789'
# https://tools.ietf.org/html/rfc3986#section-2.3
# unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
# The '~' was previously missing here even though UNRESERVED_RE below has
# always contained it, which left '~' out of UNRESERVED_CHARS_SET and
# NON_PCT_ENCODED_SET. The '!' is not an unreserved character per the RFC
# (it is a sub-delim) but is retained so existing users of these names keep
# seeing every character they saw before.
UNRESERVED = UNRESERVED_CHARS = ALPHA + DIGIT + r'._!-~'
UNRESERVED_CHARS_SET = set(UNRESERVED_CHARS)
# Every character that may appear literally, i.e. without percent-encoding.
NON_PCT_ENCODED_SET = RESERVED_CHARS_SET.union(UNRESERVED_CHARS_SET)
# We need to escape the '-' in this case, because this string is spliced
# inside a ``[...]`` character class where a bare '-' would denote a range:
UNRESERVED_RE = r'A-Za-z0-9._~\-'

# Percent encoded character values: a '%' followed by two hex digits.
PERCENT_ENCODED = PCT_ENCODED = '%[A-Fa-f0-9]{2}'
# pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
# NOTE: '%' binds tighter than '+', so only the final literal is formatted.
PCHAR = '([' + UNRESERVED_RE + SUB_DELIMITERS_RE + ':@]|%s)' % PCT_ENCODED
| |
# NOTE(sigmavirus24): The scheme pattern is deliberately stricter than the
# one given in Appendix B, so that text which is not a scheme is not
# consumed over-eagerly as one.
SCHEME_RE = '[a-zA-Z][a-zA-Z0-9+.-]*'
# The remaining components are the loose "everything up to the next
# delimiter" patterns.
_AUTHORITY_RE = '[^/?#]*'
_PATH_RE = '[^?#]*'
_QUERY_RE = '[^#]*'
_FRAGMENT_RE = '.*'

# Extracted from http://tools.ietf.org/html/rfc3986#appendix-B
# Maps each URI component name to the pattern used to capture it.
COMPONENT_PATTERN_DICT = dict(
    scheme=SCHEME_RE,
    authority=_AUTHORITY_RE,
    path=_PATH_RE,
    query=_QUERY_RE,
    fragment=_FRAGMENT_RE,
)

# See http://tools.ietf.org/html/rfc3986#appendix-B
# Every component of interest is a named group, so SRE_Match#groupdict can
# be used to pull the values out. All other groups are non-capturing, so
# SRE_Match#groups yields exactly the five components as well.
URL_PARSING_RE = ''.join([
    r'(?:(?P<scheme>%(scheme)s):)?',
    r'(?://(?P<authority>%(authority)s))?',
    r'(?P<path>%(path)s)',
    r'(?:\?(?P<query>%(query)s))?',
    r'(?:#(?P<fragment>%(fragment)s))?',
]) % COMPONENT_PATTERN_DICT
| |
| |
# #########################
# Authority Matcher Section
# #########################

# Host patterns, see: http://tools.ietf.org/html/rfc3986#section-3.2.2
# The pattern for a regular name, e.g., www.google.com, api.github.com:
# any run (possibly empty) of percent-encoded octets, sub-delimiters and
# unreserved characters, captured as a single group.
REG_NAME = (
    '((?:' + '%[0-9A-Fa-f]{2}'
    + '|[' + SUB_DELIMITERS_RE + UNRESERVED_RE + '])*)'
)
REGULAR_NAME_RE = REG_NAME
# The pattern for an IPv4 address, e.g., 192.168.255.255, 127.0.0.1.
# Only the dotted-quad shape is checked here, not the range of each octet.
IPv4_RE = r'([0-9]{1,3}\.){3}[0-9]{1,3}'
# Hexadecimal characters used in each piece of an IPv6 address
HEXDIG_RE = '[0-9A-Fa-f]{1,4}'
# Least-significant 32 bits of an IPv6 address: either two hex pieces or an
# embedded IPv4 address.
LS32_RE = '(' + HEXDIG_RE + ':' + HEXDIG_RE + '|' + IPv4_RE + ')'
# Substitutions into the following patterns for IPv6 patterns defined
# http://tools.ietf.org/html/rfc3986#page-20
_subs = dict(hex=HEXDIG_RE, ls32=LS32_RE)

# Below: h16 = hexdig, see: https://tools.ietf.org/html/rfc5234 for details
# about ABNF (Augmented Backus-Naur Form) use in the comments.
# Each template is one alternative of the IPv6address rule; the ABNF it
# implements is given in the comment above it.
variations = [
    _template % _subs
    for _template in (
        # 6( h16 ":" ) ls32
        '(%(hex)s:){6}%(ls32)s',
        # "::" 5( h16 ":" ) ls32
        '::(%(hex)s:){5}%(ls32)s',
        # [ h16 ] "::" 4( h16 ":" ) ls32
        '(%(hex)s)?::(%(hex)s:){4}%(ls32)s',
        # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
        '((%(hex)s:)?%(hex)s)?::(%(hex)s:){3}%(ls32)s',
        # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
        '((%(hex)s:){0,2}%(hex)s)?::(%(hex)s:){2}%(ls32)s',
        # [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
        '((%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s',
        # [ *4( h16 ":" ) h16 ] "::" ls32
        '((%(hex)s:){0,4}%(hex)s)?::%(ls32)s',
        # [ *5( h16 ":" ) h16 ] "::" h16
        '((%(hex)s:){0,5}%(hex)s)?::%(hex)s',
        # [ *6( h16 ":" ) h16 ] "::"
        '((%(hex)s:){0,6}%(hex)s)?::',
    )
]

# Alternation of all nine variations, each wrapped in its own group.
IPv6_RE = '((' + ')|('.join(variations) + '))'
| |
# IPvFuture: "v" hex-version "." followed by one or more unreserved,
# sub-delimiter or ":" characters.
IPv_FUTURE_RE = (
    r'v[0-9A-Fa-f]+\.[' + UNRESERVED_RE + SUB_DELIMITERS_RE + ':' + ']+'
)

# RFC 6874 Zone ID ABNF: one or more unreserved or percent-encoded chars.
ZONE_ID = '(?:[{0}]|{1})+'.format(UNRESERVED_RE, PCT_ENCODED)

# An IPv6 address with an optional zone ID. The RFC 4007 flavour accepts
# the zone separator either percent-encoded ("%25") or as a bare "%"; the
# other flavour accepts only "%25".
IPv6_ADDRZ_RFC4007_RE = '{0}(?:(?:%25|%){1})?'.format(IPv6_RE, ZONE_ID)
IPv6_ADDRZ_RE = '{0}(?:%25{1})?'.format(IPv6_RE, ZONE_ID)

# A bracketed IP literal: an IPv6 address (RFC 4007 zone syntax allowed) or
# an IPvFuture address.
IP_LITERAL_RE = (
    r'\[(' + IPv6_ADDRZ_RFC4007_RE + '|' + IPv_FUTURE_RE + r')\]'
)

# Pattern for matching the host piece of the authority
HOST_PATTERN = '(' + '|'.join((REG_NAME, IPv4_RE, IP_LITERAL_RE)) + ')'
HOST_RE = HOST_PATTERN
# Userinfo: anchored at the start, one or more unreserved, sub-delimiter,
# ":" or percent-encoded characters.
USERINFO_RE = '^([{0}{1}:]|{2})+'.format(
    UNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)
# Port: one to five decimal digits.
PORT_RE = '[0-9]{1,5}'
| |
# ####################
# Path Matcher Section
# ####################

# See http://tools.ietf.org/html/rfc3986#section-3.3 for more information
# about the path patterns defined below.
segments = {}
# Possibly-empty segment
segments['segment'] = PCHAR + '*'
# Non-zero length segment
segments['segment-nz'] = PCHAR + '+'
# Non-zero length segment without ":" (the colon is dropped from PCHAR)
segments['segment-nz-nc'] = PCHAR.replace(':', '') + '+'

# Path types taken from Section 3.3 (linked above)
PATH_EMPTY = '^$'
# Zero or more "/segment" pieces.
PATH_ABEMPTY = '(/' + segments['segment'] + ')*'
# A non-empty first segment followed by zero or more "/segment" pieces.
PATH_ROOTLESS = segments['segment-nz'] + PATH_ABEMPTY
# Same as rootless, but the first segment may not contain ":".
PATH_NOSCHEME = segments['segment-nz-nc'] + PATH_ABEMPTY
# A leading "/" optionally followed by a rootless path.
PATH_ABSOLUTE = '/(' + PATH_ROOTLESS + ')?'
# Anchored alternation of every path type.
PATH_RE = '^(' + '|'.join((
    PATH_ABEMPTY,
    PATH_ABSOLUTE,
    PATH_NOSCHEME,
    PATH_ROOTLESS,
    PATH_EMPTY,
)) + ')$'

# Query and fragment share one pattern: any run of "/", "?", ":", "@",
# unreserved, sub-delimiter or percent-encoded characters.
QUERY_RE = (
    '^([/?:@' + UNRESERVED_RE + SUB_DELIMITERS_RE + ']|' + PCT_ENCODED + ')*$'
)
FRAGMENT_RE = QUERY_RE
| |
# ##########################
# Relative reference matcher
# ##########################

# See http://tools.ietf.org/html/rfc3986#section-4.2 for details
# relative-part: "//" authority path-abempty, or an absolute, noscheme or
# empty path.
RELATIVE_PART_RE = '(//{0}{1}|{2}|{3}|{4})'.format(
    COMPONENT_PATTERN_DICT['authority'],
    PATH_ABEMPTY,
    PATH_ABSOLUTE,
    PATH_NOSCHEME,
    PATH_EMPTY,
)

# See http://tools.ietf.org/html/rfc3986#section-3 for definition
# hier-part: identical in shape to relative-part, except that the third
# alternative is a rootless path rather than a noscheme path.
HIER_PART_RE = '(//{0}{1}|{2}|{3}|{4})'.format(
    COMPONENT_PATTERN_DICT['authority'],
    PATH_ABEMPTY,
    PATH_ABSOLUTE,
    PATH_ROOTLESS,
    PATH_EMPTY,
)
| |
# ###############
# IRIs / RFC 3987
# ###############

# Character-class ranges within the Basic Multilingual Plane; these are
# used on every build.
IPRIVATE = u'\uE000-\uF8FF'
UCSCHAR_RE = u'\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF'

# Only wide-unicode gets the high-ranges of UCSCHAR (and of IPRIVATE)
if sys.maxunicode > 0xFFFF:  # pragma: no cover
    IPRIVATE += u'\U000F0000-\U000FFFFD\U00100000-\U0010FFFD'
    UCSCHAR_RE += u''.join((
        u'\U00010000-\U0001FFFD',
        u'\U00020000-\U0002FFFD',
        u'\U00030000-\U0003FFFD',
        u'\U00040000-\U0004FFFD',
        u'\U00050000-\U0005FFFD',
        u'\U00060000-\U0006FFFD',
        u'\U00070000-\U0007FFFD',
        u'\U00080000-\U0008FFFD',
        u'\U00090000-\U0009FFFD',
        u'\U000A0000-\U000AFFFD',
        u'\U000B0000-\U000BFFFD',
        u'\U000C0000-\U000CFFFD',
        u'\U000D0000-\U000DFFFD',
        # The last range starts at E1000, not E0000.
        u'\U000E1000-\U000EFFFD',
    ))
| |
# iunreserved: the ASCII unreserved class extended with the UCSCHAR ranges.
IUNRESERVED_RE = u''.join((u'A-Za-z0-9\\._~\\-', UCSCHAR_RE))
# ipchar: iunreserved, sub-delimiter, ":" or "@", or a percent-encoded octet.
IPCHAR = u'([{0}{1}:@]|{2})'.format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)

isegments = {}
# Possibly-empty segment
isegments['isegment'] = IPCHAR + u'*'
# Non-zero length segment
isegments['isegment-nz'] = IPCHAR + u'+'
# Non-zero length segment without ":" (the colon is dropped from IPCHAR)
isegments['isegment-nz-nc'] = IPCHAR.replace(':', '') + u'+'

# NOTE: the trailing "/segment" repetition in the rootless and noscheme
# patterns is a capturing group, whereas IPATH_ABEMPTY is non-capturing, so
# the two are spelled out separately rather than shared.
IPATH_ROOTLESS = (
    isegments['isegment-nz'] + u'(/' + isegments['isegment'] + u')*'
)
IPATH_NOSCHEME = (
    isegments['isegment-nz-nc'] + u'(/' + isegments['isegment'] + u')*'
)
IPATH_ABSOLUTE = u'/(?:' + IPATH_ROOTLESS + u')?'
IPATH_ABEMPTY = u'(?:/' + isegments['isegment'] + u')*'
# Anchored alternation of every IRI path type.
IPATH_RE = u'^(?:' + u'|'.join((
    IPATH_ABEMPTY,
    IPATH_ABSOLUTE,
    IPATH_NOSCHEME,
    IPATH_ROOTLESS,
    PATH_EMPTY,
)) + u')$'
| |
# ireg-name: any run (possibly empty) of percent-encoded octets,
# sub-delimiters and iunreserved characters.
IREG_NAME = (
    u'(?:' + u'%[0-9A-Fa-f]{2}'
    + u'|[' + SUB_DELIMITERS_RE + IUNRESERVED_RE + u'])*'
)
IREGULAR_NAME_RE = IREG_NAME

# ihost: an ireg-name, an IPv4 address or a bracketed IP literal.
IHOST_PATTERN = u'(' + u'|'.join((IREG_NAME, IPv4_RE, IP_LITERAL_RE)) + u')'
IHOST_RE = IHOST_PATTERN

# iuserinfo: anchored at the start, one or more iunreserved, sub-delimiter,
# ":" or percent-encoded characters.
IUSERINFO_RE = u'^(?:[{0}{1}:]|{2})+'.format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)

# ifragment and iquery differ only in that iquery additionally admits the
# IPRIVATE ranges.
IFRAGMENT_RE = u'^(?:[/?:@{0}{1}]|{2})*$'.format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)
IQUERY_RE = u'^(?:[/?:@{0}{1}{2}]|{3})*$'.format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, IPRIVATE, PCT_ENCODED
)
| |
# irelative-part: "//" authority ipath-abempty, or an absolute, noscheme or
# empty path.
IRELATIVE_PART_RE = u'(//{0}{1}|{2}|{3}|{4})'.format(
    COMPONENT_PATTERN_DICT['authority'],
    IPATH_ABEMPTY,
    IPATH_ABSOLUTE,
    IPATH_NOSCHEME,
    PATH_EMPTY,
)

# ihier-part: same shape as irelative-part, except that the third
# alternative is a rootless path rather than a noscheme path.
IHIER_PART_RE = u'(//{0}{1}|{2}|{3}|{4})'.format(
    COMPONENT_PATTERN_DICT['authority'],
    IPATH_ABEMPTY,
    IPATH_ABSOLUTE,
    IPATH_ROOTLESS,
    PATH_EMPTY,
)