You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
	
	
		
			133 lines
		
	
	
		
			4.7 KiB
		
	
	
	
		
			Python
		
	
			
		
		
	
	
			133 lines
		
	
	
		
			4.7 KiB
		
	
	
	
		
			Python
		
	
# We use native strings for all the re patterns, to take advantage of string
# formatting, and then convert to bytestrings when compiling the final re
# objects.
#
# NOTE: the composite patterns below are assembled with
# ``.format(**globals())``, so every ``{name}`` placeholder refers to a
# module-level pattern defined earlier in this file. Definition order matters.

# https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#whitespace
#  OWS            = *( SP / HTAB )
#                 ; optional whitespace
OWS = r"[ \t]*"

# https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#rule.token.separators
#   token          = 1*tchar
#
#   tchar          = "!" / "#" / "$" / "%" / "&" / "'" / "*"
#                  / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
#                  / DIGIT / ALPHA
#                  ; any VCHAR, except delimiters
token = r"[-!#$%&'*+.^_`|~0-9a-zA-Z]+"

# https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#header.fields
#  field-name     = token
field_name = token

# The standard says:
#
#  field-value    = *( field-content / obs-fold )
#  field-content  = field-vchar [ 1*( SP / HTAB ) field-vchar ]
#  field-vchar    = VCHAR / obs-text
#  obs-fold       = CRLF 1*( SP / HTAB )
#                 ; obsolete line folding
#                 ; see Section 3.2.4
#
# https://tools.ietf.org/html/rfc5234#appendix-B.1
#
#   VCHAR          =  %x21-7E
#                  ; visible (printing) characters
#
# https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#rule.quoted-string
#   obs-text       = %x80-FF
#
# However, the standard definition of field-content is WRONG! It disallows
# fields containing a single visible character surrounded by whitespace,
# e.g. "foo a bar".
#
# See: https://www.rfc-editor.org/errata_search.php?rfc=7230&eid=4189
#
# So our definition of field_content attempts to fix it up...
#
# Also, we allow lots of control characters, because apparently people assume
# that they're legal in practice (e.g., google analytics makes cookies with
# \x01 in them!):
#   https://github.com/python-hyper/h11/issues/57
# We still don't allow NUL or whitespace, because those are often treated as
# meta-characters and letting them through can lead to nasty issues like SSRF.
vchar = r"[\x21-\x7e]"
vchar_or_obs_text = r"[^\x00\s]"
field_vchar = vchar_or_obs_text
# One or more runs of field_vchar, separated by runs of SP / HTAB.
field_content = r"{field_vchar}+(?:[ \t]+{field_vchar}+)*".format(**globals())

# We handle obs-fold at a different level, and our fixed-up field_content
# already grows to swallow the whole value, so ? instead of *
field_value = r"({field_content})?".format(**globals())

#  header-field   = field-name ":" OWS field-value OWS
#
# Adjacent string literals are concatenated before .format() is applied, so
# the single .format() call below fills in the placeholders of every piece.
header_field = (
    r"(?P<field_name>{field_name})"
    r":"
    r"{OWS}"
    r"(?P<field_value>{field_value})"
    r"{OWS}".format(**globals())
)

# https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#request.line
#
#   request-line   = method SP request-target SP HTTP-version CRLF
#   method         = token
#   HTTP-version   = HTTP-name "/" DIGIT "." DIGIT
#   HTTP-name      = %x48.54.54.50 ; "HTTP", case-sensitive
#
# request-target is complicated (see RFC 7230 sec 5.3) -- could be path, full
# URL, host+port (for connect), or even "*", but in any case we are guaranteed
# that it consists of the visible printing characters.
method = token
request_target = r"{vchar}+".format(**globals())
http_version = r"HTTP/(?P<http_version>[0-9]\.[0-9])"
request_line = (
    r"(?P<method>{method})"
    r" "
    r"(?P<target>{request_target})"
    r" "
    r"{http_version}".format(**globals())
)

# https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#status.line
#
#   status-line = HTTP-version SP status-code SP reason-phrase CRLF
#   status-code    = 3DIGIT
#   reason-phrase  = *( HTAB / SP / VCHAR / obs-text )
status_code = r"[0-9]{3}"
reason_phrase = r"([ \t]|{vchar_or_obs_text})*".format(**globals())
status_line = (
    r"{http_version}"
    r" "
    r"(?P<status_code>{status_code})"
    # However, there are apparently a few too many servers out there that just
    # leave out the reason phrase:
    #   https://github.com/scrapy/scrapy/issues/345#issuecomment-281756036
    #   https://github.com/seanmonstar/httparse/issues/29
    # so make it optional. ?: is a non-capturing group.
    r"(?: (?P<reason>{reason_phrase}))?".format(**globals())
)

HEXDIG = r"[0-9A-Fa-f]"
# Actually
#
#      chunk-size     = 1*HEXDIG
#
# but we impose an upper-limit to avoid ridiculosity. len(str(2**64)) == 20
# (Note the limit is applied to hex digits, so it is looser than 2**64:
# a 64-bit size needs at most 16 hex digits.)
# The doubled braces survive .format() as the literal quantifier {1,20}.
chunk_size = r"({HEXDIG}){{1,20}}".format(**globals())
# Actually
#
#     chunk-ext      = *( ";" chunk-ext-name [ "=" chunk-ext-val ] )
#
# but we aren't parsing the things so we don't really care.
chunk_ext = r";.*"
# Even though the specification does not allow for extra whitespace, we are
# lenient with trailing whitespace because some servers in the wild use it.
chunk_header = (
    r"(?P<chunk_size>{chunk_size})"
    r"(?P<chunk_ext>{chunk_ext})?"
    r"{OWS}\r\n".format(**globals())
)