Commit c2f753a8 authored by Malcolm Tredinnick's avatar Malcolm Tredinnick
Browse files

Started to work on the regex reverse-engineering phase.


git-svn-id: http://code.djangoproject.com/svn/django/trunk@7850 bcc190cf-cafb-0310-a4f2-bffc1f526a37
parent c88e55b4
Loading
Loading
Loading
Loading
+123 −0
Original line number Diff line number Diff line
"""
Functions for reversing a regular expression (used in reverse URL resolving).

This is not, and is not intended to be, a complete reg-exp decompiler. It
should be good enough for almost all sane URLs.
"""

import re
from bisect import bisect

GROUP_CLASS = re.compile(r'''\((?:
        (?P<positional>[^?])|       # Unnamed (positional) capturing group.
        \?(?:
            P<(?P<named>[\w]+)>(?P<contents>.*)|    # Named capturing group.
            P=(?P<repeat>.+)|       # Repeat of a previous named group.
            (?P<grouping>:)|        # Non-capturing grouping parens.
            (?P<comment>\#)|        # Comment group
            (?P<illegal>.)          # Anything else (which will be an error)
        )
    ).*\)''', re.VERBOSE)

def normalize(pattern):
    """
    Given a reg-exp pattern, normalizes it to a list of forms that suffice for
    reverse matching. This does the following:

    (1) For any repeating sections, keeps the minimum number of occurrences
    permitted (this means zero for optional groups).
    (2) If an optional group includes parameters, include one occurrence of
    that group (along with the zero occurrence case from step (1)).
    (3) Select the first (essentially an arbitrary) element from any character
    class. Select an arbitrary character for any unordered class (e.g. '.' or
    '\w') in the pattern.
    (4) Take the first alternative in any '|' division, unless other
    alternatives would involve different parameters.
    (5) Ignore comments. Error on all other non-capturing (?...) forms (e.g.
    look-ahead and look-behind matches).

    Returns a list of tuples, each tuple containing (a) a pattern, (b) the
    number of parameters, (c) the names of the parameters. Any unnamed
    parameters are called '_0', '_1', etc.
    """
    # Do a linear scan to work out the special features of this pattern. The
    # idea is that we scan once here and collect all the information we need to
    # make future decisions.
    groups = []             # (start, end)
    quantifiers = []        # start pos
    ranges = []             # (start, end)
    eols = []               # pos
    disjunctions = []       # pos
    unclosed_groups = []
    unclosed_ranges = []
    escaped = False
    quantify = False
    in_range = False
    for pos, c in enumerate(pattern):
        if in_range and c != ']' or (c == ']' and
                unclosed_ranges[-1] == pos - 1):
            continue
        elif c == '[':
            unclosed_ranges.append(pos)
        elif c == ']':
            ranges.append((unclosed_ranges.pop(), pos + 1))
            in_range = False
        elif c == '.':
            # Treat this as a one-character long range:
            ranges.append((pos, pos + 1))
        elif escaped or c == '\\':
            escaped = not escaped
        elif c == '(':
            unclosed_groups.append(pos)
        elif c == ')':
            groups.append((unclosed_groups.pop(), pos + 1))
        elif quantify and c == '?':
            quantify = False
        elif c in '?*+{':
            quantifiers.append(pos)
            quantify = True
        elif c == '$':
            eols.append(pos)
        elif c == '|':
            disjunctions.append(pos)

    # Now classify each of the parenthetical groups to work out which ones take
    # parameters. Only the outer-most of a set of nested capturing groups is
    # important.
    groups.sort()
    params = []
    comments = []
    last_end = 0
    for start, end in groups:
        if start < last_end:
            # Skip over inner nested capturing groups.
            continue
        m = GROUP_CLASS.match(pattern, start)
        if m.group('positional'):
            params.append((start, end, '_%d' % len(params), start + 1))
        elif m.group('named'):
            params.append((start, end, m.group('named'), m.start('contents')))
        elif m.group('repeat'):
            params.append((start, end, m.group('repeat'), start + 1))
        elif m.group('illegal'):
            raise ValueError('The pattern construct %r is not valid here.'
                    % pattern[start:end])
        elif m.group('comment'):
            comments.extend([start, end])
        else:
            # This is a non-capturing set, so nesting prohibitions don't apply
            # to any inner groups.
            continue
        last_end = end

    # XXX: Got to here!
    results = []
    end = groups[0][0]
    # The first bit, before the first group starts.
    if end == 0:
        # FIXME: don't want to handle this case just yet.
        raise Exception

    quant_end = bisect(quantifiers, end)
    range_end = bisect(ranges, end)
    dis_end = bisect(disjunctions, end)