Fixed #19508 -- Implemented uri_to_iri as per RFC. (10b17a22) · Commits · Dom Sekotill / django

django/core/handlers/wsgi.py

+4 −7

Original line number	Diff line number	Diff line
		@@ -206,7 +206,6 @@ def get_path_info(environ):
		"""
		path_info = get_bytes_from_wsgi(environ, 'PATH_INFO', '/')

		# It'd be better to implement URI-to-IRI decoding, see #19508.
		return path_info.decode(UTF_8)


		@@ -236,7 +235,6 @@ def get_script_name(environ):
		else:
		script_name = get_bytes_from_wsgi(environ, 'SCRIPT_NAME', '')

		# It'd be better to implement URI-to-IRI decoding, see #19508.
		return script_name.decode(UTF_8)


		@@ -251,16 +249,15 @@ def get_bytes_from_wsgi(environ, key, default):
		# Under Python 3, non-ASCII values in the WSGI environ are arbitrarily
		# decoded with ISO-8859-1. This is wrong for Django websites where UTF-8
		# is the default. Re-encode to recover the original bytestring.
		return value if six.PY2 else value.encode(ISO_8859_1)
		return value.encode(ISO_8859_1) if six.PY3 else value


		def get_str_from_wsgi(environ, key, default):
		"""
		Get a value from the WSGI environ dictionary as bytes.
		Get a value from the WSGI environ dictionary as str.

		key and default should be str objects. Under Python 2 they may also be
		unicode objects provided they only contain ASCII characters.
		"""
		value = environ.get(str(key), str(default))
		# Same comment as above
		return value if six.PY2 else value.encode(ISO_8859_1).decode(UTF_8, errors='replace')
		value = get_bytes_from_wsgi(environ, key, default)
		return value.decode(UTF_8, errors='replace') if six.PY3 else value

django/core/servers/basehttp.py

+17 −0

Original line number	Diff line number	Diff line
		@@ -15,9 +15,11 @@ from wsgiref import simple_server
		from wsgiref.util import FileWrapper # NOQA: for backwards compatibility

		from django.core.exceptions import ImproperlyConfigured
		from django.core.handlers.wsgi import ISO_8859_1, UTF_8
		from django.core.management.color import color_style
		from django.core.wsgi import get_wsgi_application
		from django.utils import six
		from django.utils.encoding import uri_to_iri
		from django.utils.module_loading import import_string
		from django.utils.six.moves import socketserver

		@@ -117,6 +119,21 @@ class WSGIRequestHandler(simple_server.WSGIRequestHandler, object):

		sys.stderr.write(msg)

		def get_environ(self):
		    """
		    Build the WSGI environ and override ``PATH_INFO`` with the request
		    path converted from a URI to an IRI.

		    The query string (everything from the first ``?``) is dropped from
		    ``self.path`` before conversion.
		    """
		    environ = super(WSGIRequestHandler, self).get_environ()
		    # partition() returns the whole string as its first element when no
		    # '?' is present, so no membership test is needed.
		    raw_path = self.path.partition('?')[0]
		    utf8_path = uri_to_iri(raw_path).encode(UTF_8)
		    # Under Python 3, non-ASCII values in the WSGI environ are arbitrarily
		    # decoded with ISO-8859-1. We replicate this behavior here.
		    # Refs comment in `get_bytes_from_wsgi()`.
		    if six.PY3:
		        environ['PATH_INFO'] = utf8_path.decode(ISO_8859_1)
		    else:
		        environ['PATH_INFO'] = utf8_path
		    return environ


		def run(addr, port, wsgi_handler, ipv6=False, threading=False):
		server_address = (addr, port)

django/test/client.py

+8 −8

Original line number	Diff line number	Diff line
		@@ -12,7 +12,7 @@ from django.apps import apps
		from django.conf import settings
		from django.core import urlresolvers
		from django.core.handlers.base import BaseHandler
		from django.core.handlers.wsgi import WSGIRequest
		from django.core.handlers.wsgi import WSGIRequest, ISO_8859_1, UTF_8
		from django.core.signals import (request_started, request_finished,
		got_request_exception)
		from django.db import close_old_connections
		@@ -20,11 +20,11 @@ from django.http import SimpleCookie, HttpRequest, QueryDict
		from django.template import TemplateDoesNotExist
		from django.test import signals
		from django.utils.functional import curry, SimpleLazyObject
		from django.utils.encoding import force_bytes, force_str
		from django.utils.encoding import force_bytes, force_str, uri_to_iri
		from django.utils.http import urlencode
		from django.utils.itercompat import is_iterable
		from django.utils import six
		from django.utils.six.moves.urllib.parse import unquote, urlparse, urlsplit
		from django.utils.six.moves.urllib.parse import urlparse, urlsplit
		from django.test.utils import ContextList

		__all__ = ('Client', 'RequestFactory', 'encode_file', 'encode_multipart')
		@@ -270,11 +270,11 @@ class RequestFactory(object):
		# If there are parameters, add them
		if parsed[3]:
		path += str(";") + force_str(parsed[3])
		path = unquote(path)
		# WSGI requires latin-1 encoded strings. See get_path_info().
		if six.PY3:
		path = path.encode('utf-8').decode('iso-8859-1')
		return path
		path = uri_to_iri(path).encode(UTF_8)
		# Under Python 3, non-ASCII values in the WSGI environ are arbitrarily
		# decoded with ISO-8859-1. We replicate this behavior here.
		# Refs comment in `get_bytes_from_wsgi()`.
		return path.decode(ISO_8859_1) if six.PY3 else path

		def get(self, path, data=None, secure=False, **extra):
		"Construct a GET request."

django/utils/encoding.py

+39 −2

Original line number	Diff line number	Diff line
		# -*- encoding: utf-8 -*-
		from __future__ import unicode_literals

		import codecs
		@@ -7,7 +8,9 @@ import locale

		from django.utils.functional import Promise
		from django.utils import six
		from django.utils.six.moves.urllib.parse import quote
		from django.utils.six.moves.urllib.parse import quote, unquote
		if six.PY3:
		from urllib.parse import unquote_to_bytes


		class DjangoUnicodeDecodeError(UnicodeDecodeError):
		@@ -185,7 +188,9 @@ def iri_to_uri(iri):
		assuming input is either UTF-8 or unicode already, we can simplify things a
		little from the full method.

		Returns an ASCII string containing the encoded result.
		Takes an IRI in UTF-8 bytes (e.g. '/I \xe2\x99\xa5 Django/') or unicode
		(e.g. '/I ♥ Django/') and returns ASCII bytes containing the encoded result
		(e.g. '/I%20%E2%99%A5%20Django/').
		"""
		# The list of safe characters here is constructed from the "reserved" and
		# "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986:
		@@ -204,6 +209,38 @@ def iri_to_uri(iri):
		return quote(force_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~")


		def uri_to_iri(uri):
		    """
		    Convert a Uniform Resource Identifier (URI) into an Internationalized
		    Resource Identifier (IRI).

		    This is the algorithm from section 3.2 of RFC 3987.

		    Takes a URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns
		    unicode containing the decoded result (e.g. '/I ♥ Django/'). ``None``
		    is returned unchanged.
		    """
		    if uri is None:
		        return uri
		    raw = force_bytes(uri)
		    # Percent-decode to raw octets; Python 2's unquote() already works on
		    # byte strings, Python 3 needs the bytes-returning variant.
		    if six.PY3:
		        octets = unquote_to_bytes(raw)
		    else:
		        octets = unquote(raw)
		    # Octets that do not form valid UTF-8 are percent-encoded again so the
		    # final decode cannot fail.
		    return repercent_broken_unicode(octets).decode('utf-8')


		def repercent_broken_unicode(path):
		    """
		    Re-percent-encode the octets of ``path`` that are not valid UTF-8.

		    As per section 3.2 of RFC 3987, step three of converting a URI into an
		    IRI, any octet produced by percent-decoding that is not part of a
		    strictly legal UTF-8 octet sequence must be percent-encoded again.

		    ``path`` is a byte string. The returned byte string always decodes
		    cleanly as UTF-8.

		    The scan is iterative rather than recursive: one recursion level per
		    invalid sequence would exhaust the interpreter's recursion limit (and
		    re-copy the whole buffer each time) on input with many broken octets.
		    """
		    pieces = []
		    while True:
		        try:
		            path.decode('utf-8')
		        except UnicodeDecodeError as e:
		            repercent = quote(path[e.start:e.end], safe=b"/#%[]=:;$&()+,!?*@'~")
		            # Everything before e.start is valid UTF-8; keep it as is.
		            pieces.append(path[:e.start])
		            # quote() output is pure ASCII, so this encode cannot fail.
		            pieces.append(repercent.encode('ascii'))
		            # Resume after the offending octets; the decoder would restart
		            # there too, so the already-checked prefix is not re-scanned.
		            path = path[e.end:]
		        else:
		            pieces.append(path)
		            return b''.join(pieces)


		def filepath_to_uri(path):
		"""Convert a file system path to a URI portion that is suitable for
		inclusion in a URL.

docs/ref/unicode.txt

+24 −7

Original line number	Diff line number	Diff line
		@@ -173,11 +173,11 @@ URL from an IRI_ -- very loosely speaking, a URI_ that can contain Unicode
		characters. Quoting and converting an IRI to URI can be a little tricky, so
		Django provides some assistance.

		* The function ``django.utils.encoding.iri_to_uri()`` implements the
		conversion from IRI to URI as required by the specification (:rfc:`3987`).
		* The function :func:`django.utils.encoding.iri_to_uri()` implements the
		conversion from IRI to URI as required by the specification (:rfc:`3987#section-3.1`).

		* The functions ``django.utils.http.urlquote()`` and
		``django.utils.http.urlquote_plus()`` are versions of Python's standard
		* The functions :func:`django.utils.http.urlquote()` and
		:func:`django.utils.http.urlquote_plus()` are versions of Python's standard
		``urllib.quote()`` and ``urllib.quote_plus()`` that work with non-ASCII
		characters. (The data is converted to UTF-8 prior to encoding.)

		@@ -213,12 +213,29 @@ you can construct your IRI without worrying about whether it contains
		non-ASCII characters and then, right at the end, call ``iri_to_uri()`` on the
		result.

		The ``iri_to_uri()`` function is also idempotent, which means the following is
		always true::
		Similarly, Django provides :func:`django.utils.encoding.uri_to_iri()` which
		implements the conversion from URI to IRI as per :rfc:`3987#section-3.2`.
		It decodes all percent-encodings except those that don't represent a valid
		UTF-8 sequence.

		An example to demonstrate::

		>>> uri_to_iri('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93')
		'/♥♥/?utf8=✓'
		>>> uri_to_iri('%A9helloworld')
		'%A9helloworld'

		In the first example, the UTF-8 characters and reserved characters are
		unquoted. In the second, the percent-encoding remains unchanged because it
		lies outside the valid UTF-8 range.

		Both ``iri_to_uri()`` and ``uri_to_iri()`` functions are idempotent, which means the
		following is always true::

		iri_to_uri(iri_to_uri(some_string)) == iri_to_uri(some_string)
		uri_to_iri(uri_to_iri(some_string)) == uri_to_iri(some_string)

		So you can safely call it multiple times on the same IRI without risking
		So you can safely call it multiple times on the same URI/IRI without risking
		double-quoting problems.

		.. _URI: http://www.ietf.org/rfc/rfc2396.txt