Loading django/core/handlers/wsgi.py +4 −7 Original line number Diff line number Diff line Loading @@ -206,7 +206,6 @@ def get_path_info(environ): """ path_info = get_bytes_from_wsgi(environ, 'PATH_INFO', '/') # It'd be better to implement URI-to-IRI decoding, see #19508. return path_info.decode(UTF_8) Loading Loading @@ -236,7 +235,6 @@ def get_script_name(environ): else: script_name = get_bytes_from_wsgi(environ, 'SCRIPT_NAME', '') # It'd be better to implement URI-to-IRI decoding, see #19508. return script_name.decode(UTF_8) Loading @@ -251,16 +249,15 @@ def get_bytes_from_wsgi(environ, key, default): # Under Python 3, non-ASCII values in the WSGI environ are arbitrarily # decoded with ISO-8859-1. This is wrong for Django websites where UTF-8 # is the default. Re-encode to recover the original bytestring. return value if six.PY2 else value.encode(ISO_8859_1) return value.encode(ISO_8859_1) if six.PY3 else value def get_str_from_wsgi(environ, key, default): """ Get a value from the WSGI environ dictionary as bytes. Get a value from the WSGI environ dictionary as str. key and default should be str objects. Under Python 2 they may also be unicode objects provided they only contain ASCII characters. 
""" value = environ.get(str(key), str(default)) # Same comment as above return value if six.PY2 else value.encode(ISO_8859_1).decode(UTF_8, errors='replace') value = get_bytes_from_wsgi(environ, key, default) return value.decode(UTF_8, errors='replace') if six.PY3 else value django/core/servers/basehttp.py +17 −0 Original line number Diff line number Diff line Loading @@ -15,9 +15,11 @@ from wsgiref import simple_server from wsgiref.util import FileWrapper # NOQA: for backwards compatibility from django.core.exceptions import ImproperlyConfigured from django.core.handlers.wsgi import ISO_8859_1, UTF_8 from django.core.management.color import color_style from django.core.wsgi import get_wsgi_application from django.utils import six from django.utils.encoding import uri_to_iri from django.utils.module_loading import import_string from django.utils.six.moves import socketserver Loading Loading @@ -117,6 +119,21 @@ class WSGIRequestHandler(simple_server.WSGIRequestHandler, object): sys.stderr.write(msg) def get_environ(self): env = super(WSGIRequestHandler, self).get_environ() path = self.path if '?' in path: path = path.partition('?')[0] path = uri_to_iri(path).encode(UTF_8) # Under Python 3, non-ASCII values in the WSGI environ are arbitrarily # decoded with ISO-8859-1. We replicate this behavior here. # Refs comment in `get_bytes_from_wsgi()`. 
env['PATH_INFO'] = path.decode(ISO_8859_1) if six.PY3 else path return env def run(addr, port, wsgi_handler, ipv6=False, threading=False): server_address = (addr, port) Loading django/test/client.py +8 −8 Original line number Diff line number Diff line Loading @@ -12,7 +12,7 @@ from django.apps import apps from django.conf import settings from django.core import urlresolvers from django.core.handlers.base import BaseHandler from django.core.handlers.wsgi import WSGIRequest from django.core.handlers.wsgi import WSGIRequest, ISO_8859_1, UTF_8 from django.core.signals import (request_started, request_finished, got_request_exception) from django.db import close_old_connections Loading @@ -20,11 +20,11 @@ from django.http import SimpleCookie, HttpRequest, QueryDict from django.template import TemplateDoesNotExist from django.test import signals from django.utils.functional import curry, SimpleLazyObject from django.utils.encoding import force_bytes, force_str from django.utils.encoding import force_bytes, force_str, uri_to_iri from django.utils.http import urlencode from django.utils.itercompat import is_iterable from django.utils import six from django.utils.six.moves.urllib.parse import unquote, urlparse, urlsplit from django.utils.six.moves.urllib.parse import urlparse, urlsplit from django.test.utils import ContextList __all__ = ('Client', 'RequestFactory', 'encode_file', 'encode_multipart') Loading Loading @@ -270,11 +270,11 @@ class RequestFactory(object): # If there are parameters, add them if parsed[3]: path += str(";") + force_str(parsed[3]) path = unquote(path) # WSGI requires latin-1 encoded strings. See get_path_info(). if six.PY3: path = path.encode('utf-8').decode('iso-8859-1') return path path = uri_to_iri(path).encode(UTF_8) # Under Python 3, non-ASCII values in the WSGI environ are arbitrarily # decoded with ISO-8859-1. We replicate this behavior here. # Refs comment in `get_bytes_from_wsgi()`. 
return path.decode(ISO_8859_1) if six.PY3 else path def get(self, path, data=None, secure=False, **extra): "Construct a GET request." Loading django/utils/encoding.py +39 −2 Original line number Diff line number Diff line # -*- encoding: utf-8 -*- from __future__ import unicode_literals import codecs Loading @@ -7,7 +8,9 @@ import locale from django.utils.functional import Promise from django.utils import six from django.utils.six.moves.urllib.parse import quote from django.utils.six.moves.urllib.parse import quote, unquote if six.PY3: from urllib.parse import unquote_to_bytes class DjangoUnicodeDecodeError(UnicodeDecodeError): Loading Loading @@ -185,7 +188,9 @@ def iri_to_uri(iri): assuming input is either UTF-8 or unicode already, we can simplify things a little from the full method. Returns an ASCII string containing the encoded result. Takes an IRI in UTF-8 bytes (e.g. '/I \xe2\x99\xa5 Django/') or unicode (e.g. '/I ♥ Django/') and returns ASCII bytes containing the encoded result (e.g. '/I%20%E2%99%A5%20Django/'). """ # The list of safe characters here is constructed from the "reserved" and # "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986: Loading @@ -204,6 +209,38 @@ def iri_to_uri(iri): return quote(force_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~") def uri_to_iri(uri): """ Converts a Uniform Resource Identifier(URI) into an Internationalized Resource Identifier(IRI). This is the algorithm from section 3.2 of RFC 3987. Takes an URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns unicode containing the encoded result (e.g. '/I \xe2\x99\xa5 Django/'). 
""" if uri is None: return uri uri = force_bytes(uri) iri = unquote_to_bytes(uri) if six.PY3 else unquote(uri) return repercent_broken_unicode(iri).decode('utf-8') def repercent_broken_unicode(path): """ As per section 3.2 of RFC 3987, step three of converting a URI into an IRI, we need to re-percent-encode any octet produced that is not part of a strictly legal UTF-8 octet sequence. """ try: path.decode('utf-8') except UnicodeDecodeError as e: repercent = quote(path[e.start:e.end], safe=b"/#%[]=:;$&()+,!?*@'~") path = repercent_broken_unicode( path[:e.start] + force_bytes(repercent) + path[e.end:]) return path def filepath_to_uri(path): """Convert a file system path to a URI portion that is suitable for inclusion in a URL. Loading docs/ref/unicode.txt +24 −7 Original line number Diff line number Diff line Loading @@ -173,11 +173,11 @@ URL from an IRI_ -- very loosely speaking, a URI_ that can contain Unicode characters. Quoting and converting an IRI to URI can be a little tricky, so Django provides some assistance. * The function ``django.utils.encoding.iri_to_uri()`` implements the conversion from IRI to URI as required by the specification (:rfc:`3987`). * The function :func:`django.utils.encoding.iri_to_uri()` implements the conversion from IRI to URI as required by the specification (:rfc:`3987#section-3.1`). * The functions ``django.utils.http.urlquote()`` and ``django.utils.http.urlquote_plus()`` are versions of Python's standard * The functions :func:`django.utils.http.urlquote()` and :func:`django.utils.http.urlquote_plus()` are versions of Python's standard ``urllib.quote()`` and ``urllib.quote_plus()`` that work with non-ASCII characters. (The data is converted to UTF-8 prior to encoding.) Loading Loading @@ -213,12 +213,29 @@ you can construct your IRI without worrying about whether it contains non-ASCII characters and then, right at the end, call ``iri_to_uri()`` on the result. 
The ``iri_to_uri()`` function is also idempotent, which means the following is always true:: Similarly, Django provides :func:`django.utils.encoding.uri_to_iri()` which implements the conversion from URI to IRI as per :rfc:`3987#section-3.2`. It decodes all percent-encodings except those that don't represent a valid UTF-8 sequence. An example to demonstrate:: >>> uri_to_iri('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93') '/♥♥/?utf8=✓' >>> uri_to_iri('%A9helloworld') '%A9helloworld' In the first example, the UTF-8 characters and reserved characters are unquoted. In the second, the percent-encoding remains unchanged because it lies outside the valid UTF-8 range. Both ``iri_to_uri()`` and ``uri_to_iri()`` functions are idempotent, which means the following is always true:: iri_to_uri(iri_to_uri(some_string)) == iri_to_uri(some_string) uri_to_iri(uri_to_iri(some_string)) == uri_to_iri(some_string) So you can safely call it multiple times on the same IRI without risking So you can safely call either function multiple times on the same URI/IRI without risking double-quoting problems. .. _URI: http://www.ietf.org/rfc/rfc2396.txt Loading Loading
django/core/handlers/wsgi.py +4 −7 Original line number Diff line number Diff line Loading @@ -206,7 +206,6 @@ def get_path_info(environ): """ path_info = get_bytes_from_wsgi(environ, 'PATH_INFO', '/') # It'd be better to implement URI-to-IRI decoding, see #19508. return path_info.decode(UTF_8) Loading Loading @@ -236,7 +235,6 @@ def get_script_name(environ): else: script_name = get_bytes_from_wsgi(environ, 'SCRIPT_NAME', '') # It'd be better to implement URI-to-IRI decoding, see #19508. return script_name.decode(UTF_8) Loading @@ -251,16 +249,15 @@ def get_bytes_from_wsgi(environ, key, default): # Under Python 3, non-ASCII values in the WSGI environ are arbitrarily # decoded with ISO-8859-1. This is wrong for Django websites where UTF-8 # is the default. Re-encode to recover the original bytestring. return value if six.PY2 else value.encode(ISO_8859_1) return value.encode(ISO_8859_1) if six.PY3 else value def get_str_from_wsgi(environ, key, default): """ Get a value from the WSGI environ dictionary as bytes. Get a value from the WSGI environ dictionary as str. key and default should be str objects. Under Python 2 they may also be unicode objects provided they only contain ASCII characters. """ value = environ.get(str(key), str(default)) # Same comment as above return value if six.PY2 else value.encode(ISO_8859_1).decode(UTF_8, errors='replace') value = get_bytes_from_wsgi(environ, key, default) return value.decode(UTF_8, errors='replace') if six.PY3 else value
django/core/servers/basehttp.py +17 −0 Original line number Diff line number Diff line Loading @@ -15,9 +15,11 @@ from wsgiref import simple_server from wsgiref.util import FileWrapper # NOQA: for backwards compatibility from django.core.exceptions import ImproperlyConfigured from django.core.handlers.wsgi import ISO_8859_1, UTF_8 from django.core.management.color import color_style from django.core.wsgi import get_wsgi_application from django.utils import six from django.utils.encoding import uri_to_iri from django.utils.module_loading import import_string from django.utils.six.moves import socketserver Loading Loading @@ -117,6 +119,21 @@ class WSGIRequestHandler(simple_server.WSGIRequestHandler, object): sys.stderr.write(msg) def get_environ(self): env = super(WSGIRequestHandler, self).get_environ() path = self.path if '?' in path: path = path.partition('?')[0] path = uri_to_iri(path).encode(UTF_8) # Under Python 3, non-ASCII values in the WSGI environ are arbitrarily # decoded with ISO-8859-1. We replicate this behavior here. # Refs comment in `get_bytes_from_wsgi()`. env['PATH_INFO'] = path.decode(ISO_8859_1) if six.PY3 else path return env def run(addr, port, wsgi_handler, ipv6=False, threading=False): server_address = (addr, port) Loading
django/test/client.py +8 −8 Original line number Diff line number Diff line Loading @@ -12,7 +12,7 @@ from django.apps import apps from django.conf import settings from django.core import urlresolvers from django.core.handlers.base import BaseHandler from django.core.handlers.wsgi import WSGIRequest from django.core.handlers.wsgi import WSGIRequest, ISO_8859_1, UTF_8 from django.core.signals import (request_started, request_finished, got_request_exception) from django.db import close_old_connections Loading @@ -20,11 +20,11 @@ from django.http import SimpleCookie, HttpRequest, QueryDict from django.template import TemplateDoesNotExist from django.test import signals from django.utils.functional import curry, SimpleLazyObject from django.utils.encoding import force_bytes, force_str from django.utils.encoding import force_bytes, force_str, uri_to_iri from django.utils.http import urlencode from django.utils.itercompat import is_iterable from django.utils import six from django.utils.six.moves.urllib.parse import unquote, urlparse, urlsplit from django.utils.six.moves.urllib.parse import urlparse, urlsplit from django.test.utils import ContextList __all__ = ('Client', 'RequestFactory', 'encode_file', 'encode_multipart') Loading Loading @@ -270,11 +270,11 @@ class RequestFactory(object): # If there are parameters, add them if parsed[3]: path += str(";") + force_str(parsed[3]) path = unquote(path) # WSGI requires latin-1 encoded strings. See get_path_info(). if six.PY3: path = path.encode('utf-8').decode('iso-8859-1') return path path = uri_to_iri(path).encode(UTF_8) # Under Python 3, non-ASCII values in the WSGI environ are arbitrarily # decoded with ISO-8859-1. We replicate this behavior here. # Refs comment in `get_bytes_from_wsgi()`. return path.decode(ISO_8859_1) if six.PY3 else path def get(self, path, data=None, secure=False, **extra): "Construct a GET request." Loading
django/utils/encoding.py +39 −2 Original line number Diff line number Diff line # -*- encoding: utf-8 -*- from __future__ import unicode_literals import codecs Loading @@ -7,7 +8,9 @@ import locale from django.utils.functional import Promise from django.utils import six from django.utils.six.moves.urllib.parse import quote from django.utils.six.moves.urllib.parse import quote, unquote if six.PY3: from urllib.parse import unquote_to_bytes class DjangoUnicodeDecodeError(UnicodeDecodeError): Loading Loading @@ -185,7 +188,9 @@ def iri_to_uri(iri): assuming input is either UTF-8 or unicode already, we can simplify things a little from the full method. Returns an ASCII string containing the encoded result. Takes an IRI in UTF-8 bytes (e.g. '/I \xe2\x99\xa5 Django/') or unicode (e.g. '/I ♥ Django/') and returns ASCII bytes containing the encoded result (e.g. '/I%20%E2%99%A5%20Django/'). """ # The list of safe characters here is constructed from the "reserved" and # "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986: Loading @@ -204,6 +209,38 @@ def iri_to_uri(iri): return quote(force_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~") def uri_to_iri(uri): """ Converts a Uniform Resource Identifier(URI) into an Internationalized Resource Identifier(IRI). This is the algorithm from section 3.2 of RFC 3987. Takes an URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns unicode containing the encoded result (e.g. '/I \xe2\x99\xa5 Django/'). """ if uri is None: return uri uri = force_bytes(uri) iri = unquote_to_bytes(uri) if six.PY3 else unquote(uri) return repercent_broken_unicode(iri).decode('utf-8') def repercent_broken_unicode(path): """ As per section 3.2 of RFC 3987, step three of converting a URI into an IRI, we need to re-percent-encode any octet produced that is not part of a strictly legal UTF-8 octet sequence. 
""" try: path.decode('utf-8') except UnicodeDecodeError as e: repercent = quote(path[e.start:e.end], safe=b"/#%[]=:;$&()+,!?*@'~") path = repercent_broken_unicode( path[:e.start] + force_bytes(repercent) + path[e.end:]) return path def filepath_to_uri(path): """Convert a file system path to a URI portion that is suitable for inclusion in a URL. Loading
docs/ref/unicode.txt +24 −7 Original line number Diff line number Diff line Loading @@ -173,11 +173,11 @@ URL from an IRI_ -- very loosely speaking, a URI_ that can contain Unicode characters. Quoting and converting an IRI to URI can be a little tricky, so Django provides some assistance. * The function ``django.utils.encoding.iri_to_uri()`` implements the conversion from IRI to URI as required by the specification (:rfc:`3987`). * The function :func:`django.utils.encoding.iri_to_uri()` implements the conversion from IRI to URI as required by the specification (:rfc:`3987#section-3.1`). * The functions ``django.utils.http.urlquote()`` and ``django.utils.http.urlquote_plus()`` are versions of Python's standard * The functions :func:`django.utils.http.urlquote()` and :func:`django.utils.http.urlquote_plus()` are versions of Python's standard ``urllib.quote()`` and ``urllib.quote_plus()`` that work with non-ASCII characters. (The data is converted to UTF-8 prior to encoding.) Loading Loading @@ -213,12 +213,29 @@ you can construct your IRI without worrying about whether it contains non-ASCII characters and then, right at the end, call ``iri_to_uri()`` on the result. The ``iri_to_uri()`` function is also idempotent, which means the following is always true:: Similarly, Django provides :func:`django.utils.encoding.uri_to_iri()` which implements the conversion from URI to IRI as per :rfc:`3987#section-3.2`. It decodes all percent-encodings except those that don't represent a valid UTF-8 sequence. An example to demonstrate:: >>> uri_to_iri('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93') '/♥♥/?utf8=✓' >>> uri_to_iri('%A9helloworld') '%A9helloworld' In the first example, the UTF-8 characters and reserved characters are unquoted. In the second, the percent-encoding remains unchanged because it lies outside the valid UTF-8 range. 
Both ``iri_to_uri()`` and ``uri_to_iri()`` functions are idempotent, which means the following is always true:: iri_to_uri(iri_to_uri(some_string)) == iri_to_uri(some_string) uri_to_iri(uri_to_iri(some_string)) == uri_to_iri(some_string) So you can safely call it multiple times on the same IRI without risking So you can safely call either function multiple times on the same URI/IRI without risking double-quoting problems. .. _URI: http://www.ietf.org/rfc/rfc2396.txt Loading