Source code for rule_engine.suggestions

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
#  rule_engine/suggestions.py
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions are
#  met:
#
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above
#    copyright notice, this list of conditions and the following disclaimer
#    in the documentation and/or other materials provided with the
#    distribution.
#  * Neither the name of the project nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

import functools
import re
from collections.abc import Iterable, Sequence
from typing import Any


def jaro_distance(str1: str, str2: str) -> float:
    """
    Compute the Jaro score of two strings.

    Despite the name, the returned value grows with likeness: identical
    strings yield ``1.0`` and strings with no matching characters yield
    ``0.0``.

    :param str str1: The first string to compare.
    :param str str2: The second string to compare.
    :return: The Jaro score of the two strings.
    :rtype: float
    """
    if str1 == str2:
        return 1.0

    len1, len2 = len(str1), len(str2)
    longest = max(len1, len2)
    # Two equal characters only count as a match when their positions are
    # at most this far apart.
    window = (longest // 2) - 1
    flagged1 = [False] * longest
    flagged2 = [False] * longest
    match_count = 0.0

    for pos1, char1 in enumerate(str1):
        lower = max(0, pos1 - window)
        upper = min(pos1 + window, len2 - 1) + 1
        for pos2 in range(lower, upper):
            if flagged2[pos2] or char1 != str2[pos2]:
                continue
            # Claim the first unused equal character inside the window.
            flagged1[pos1] = flagged2[pos2] = True
            match_count += 1
            break

    if match_count == 0.0:
        return 0.0

    # Line up the matched characters of both strings in their original order;
    # every position where they disagree is one half of a transposition.
    matched1 = [char for char, hit in zip(str1, flagged1) if hit]
    matched2 = [char for char, hit in zip(str2, flagged2) if hit]
    mismatched = float(sum(left != right for left, right in zip(matched1, matched2)))

    ratio1 = match_count / len1
    ratio2 = match_count / len2
    ratio_order = (match_count - mismatched / 2.0) / match_count
    return (ratio1 + ratio2 + ratio_order) / 3.0

def jaro_winkler_distance(str1: str, str2: str, scale: float = 0.1) -> float:
    """
    Compute the Jaro-Winkler score of two strings.

    The score starts from :py:func:`jaro_distance` and, when that value is
    above ``0.7``, is raised by ``scale * prefix * (1 - score)`` where
    *prefix* is the length of the common leading run of both strings, capped
    at 4 characters. Despite the name, a higher value means the strings are
    more alike.

    :param str str1: The first string to compare.
    :param str str2: The second string to compare.
    :param float scale: The weight given to each character of common prefix.
    :return: The Jaro-Winkler score of the two strings.
    :rtype: float
    """
    jaro_dist = jaro_distance(str1, str2)
    if jaro_dist > 0.7:
        prefix = 0
        # Winkler's adjustment rewards at most 4 leading characters; with that
        # cap any scale up to 0.25 keeps the boosted score at or below 1.0.
        for char1, char2 in zip(str1[:4], str2[:4]):
            if char1 != char2:
                break
            prefix += 1
        jaro_dist += scale * prefix * (1 - jaro_dist)
    return jaro_dist

def jaro_winkler_similarity(*args: Any, **kwargs: Any) -> float:
    """
    Return one minus the result of :py:func:`jaro_winkler_distance`.

    All arguments are forwarded unchanged to
    :py:func:`jaro_winkler_distance`. Because that function returns ``1.0``
    for identical strings, this value is ``0`` for identical strings and
    grows as the strings diverge.

    :return: The complement of the Jaro-Winkler score.
    :rtype: float
    """
    score = jaro_winkler_distance(*args, **kwargs)
    return 1 - score

def _suggest(word: str, options: Sequence[str]) -> str | None:
    """
    Pick the entry of *options* that is closest to *word*.

    Each option is scored with :py:func:`jaro_winkler_similarity` against
    *word* and the option with the lowest score is returned. When several
    options share the lowest score, the one appearing first in *options* wins.

    :param str word: The word to find a replacement for.
    :param options: The candidate strings to choose from.
    :return: The closest option, or ``None`` when *options* is empty.
    :rtype: str
    """
    if len(options) == 0:
        return None
    # Only the single best candidate is needed, so take the minimum directly
    # instead of ordering every option.
    return min(options, key=functools.partial(jaro_winkler_similarity, word))

def suggest_symbol(word: str, options: Iterable[str]) -> str | None:
    """
    Select the best match for *word* from a list of value *options*. Values
    that are not suitable symbol names will be filtered out of *options*. If
    no match is found, this function will return None.

    :param str word: The original word to suggest an alternative for.
    :param options: An iterable of strings to select the best match from.
    :return: The best replacement for *word*, or ``None`` when no option is a
        suitable symbol name.
    :rtype: str
    """
    from .parser import Parser  # avoid circular imports
    # Build the anchored symbol pattern once rather than once per option.
    symbol_regex = re.compile('^' + Parser.get_token_regex('SYMBOL') + '$')
    return _suggest(word, [option for option in options if symbol_regex.match(option)])