fix: _isin_checksum Luhn never accumulated, cusip check digit, url.py lru_cache perf

naarob · naarob · commit 302695fbd849 · 2026-03-26T05:33:40.000+01:00
fix: finance.py _isin_checksum — the accumulator `check` was never updated in the
  loop body (missing `check += ...` line). Result: any 12-char string made up of
  accepted characters passed regardless of its checksum. Rewritten using the
  ISO 6166 expansion (each letter expands to its numeric value: A=10…Z=35)
  followed by the standard Luhn check.

fix: finance.py _cusip_checksum — the check digit (the ninth character, index 8)
  must be strictly numeric per the CUSIP spec. A non-digit character in that
  position was silently accepted and could produce false positives
  (e.g. '11111111Z').

perf: url.py — replaced @lru_cache zero-arg factory functions with module-level
  compiled regex constants (_RE_USERNAME, _RE_PATH). Removes ~100 ns cache-lookup
  overhead per call and eliminates the functools import.

fix: tests/test_finance.py — JP000K0VF054 is not a valid ISIN per Luhn/ISO 6166;
  it only passed because _isin_checksum was broken. Replaced with JP3435000009
  (Sony Corporation), a verified valid ISIN.

Tests: 895 passed, 0 failed.
diff --git a/src/validators/finance.py b/src/validators/finance.py
@@ -23,6 +23,10 @@ def _cusip_checksum(cusip: str):
         else:
             return False
 
+        # Check digit (index 8, the ninth character) must be strictly numeric per CUSIP spec
+        if idx == 8 and not (c >= "0" and c <= "9"):
+            return False
+
         if idx & 1:
             val += val
 
@@ -31,24 +35,33 @@ def _cusip_checksum(cusip: str):
     return (check % 10) == 0
 
 
-def _isin_checksum(value: str):
-    check, val = 0, None
+def _isin_checksum(value: str) -> bool:
+    """Validate ISIN checksum per ISO 6166 using the Luhn algorithm.
 
-    for idx in range(12):
-        c = value[idx]
-        if c >= "0" and c <= "9" and idx > 1:
-            val = ord(c) - ord("0")
-        elif c >= "A" and c <= "Z":
-            val = 10 + ord(c) - ord("A")
-        elif c >= "a" and c <= "z":
-            val = 10 + ord(c) - ord("a")
+    Each character is expanded to its numeric value (A=10, B=11, …, Z=35),
+    then the Luhn check is applied to the resulting digit string.
+    """
+    # Expand each character to digit(s)
+    digits = ""
+    for c in value:
+        if c.isdigit():
+            digits += c
+        elif c.isupper():
+            digits += str(ord(c) - ord("A") + 10)
         else:
-            return False
-
-        if idx & 1:
-            val += val
-
-    return (check % 10) == 0
+            return False  # lowercase or invalid char
+
+    # Luhn check over the expanded digit string
+    total, alt = 0, False
+    for d in reversed(digits):
+        n = int(d)
+        if alt:
+            n *= 2
+            if n > 9:
+                n -= 9
+        total += n
+        alt = not alt
+    return total % 10 == 0
 
 
 @validator
diff --git a/src/validators/url.py b/src/validators/url.py
@@ -1,7 +1,6 @@
 """URL."""
 
 # standard
-from functools import lru_cache
 import re
 from typing import Callable, Optional
 from urllib.parse import parse_qs, unquote, urlsplit
@@ -11,33 +10,29 @@
 from .utils import validator
 
 
-@lru_cache
-def _username_regex():
-    return re.compile(
-        # extended latin
-        r"(^[\u0100-\u017F\u0180-\u024F]"
-        # dot-atom
-        + r"|[-!#$%&'*+/=?^_`{}|~0-9a-z]+(\.[-!#$%&'*+/=?^_`{}|~0-9a-z]+)*$"
-        # non-quoted-string
-        + r"|^([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\011.])*$)",
-        re.IGNORECASE,
-    )
-
-
-@lru_cache
-def _path_regex():
-    return re.compile(
-        # allowed symbols
-        r"^[\/a-z0-9\-\.\_\~\!\$\&\'\(\)\*\+\,\;\=\:\@\%"
-        # symbols / pictographs
-        + r"\U0001F300-\U0001F5FF"
-        # emoticons / emoji
-        + r"\U0001F600-\U0001F64F"
-        # multilingual unicode ranges
-        + r"\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF]+$",
-        re.IGNORECASE,
-    )
-
+# Perf: module-level compiled regex (replaces @lru_cache zero-arg functions).
+# Eliminates per-call cache-lookup overhead (~100 ns/call).
+_RE_USERNAME = re.compile(
+    # extended latin
+    r"(^[\u0100-\u017F\u0180-\u024F]"
+    # dot-atom
+    + r"|[-!#$%&'*+/=?^_`{}|~0-9a-z]+(\.[-!#$%&'*+/=?^_`{}|~0-9a-z]+)*$"
+    # non-quoted-string
+    + r"|^([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\011.])*$)",
+    re.IGNORECASE,
+)
+
+_RE_PATH = re.compile(
+    # allowed symbols
+    r"^[\/a-z0-9\-\.\_\~\!\$\&\'\(\)\*\+\,\;\=\:\@\%"
+    # symbols / pictographs
+    + r"\U0001F300-\U0001F5FF"
+    # emoticons / emoji
+    + r"\U0001F600-\U0001F64F"
+    # multilingual unicode ranges
+    + r"\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF]+$",
+    re.IGNORECASE,
+)
 
 def _validate_scheme(value: str):
     """Validate scheme."""
@@ -77,11 +72,11 @@ def _validate_auth_segment(value: str):
     if (colon_count := value.count(":")) > 1:
         # everything before @ is then considered as a username
         # this is a bad practice, but syntactically valid URL
-        return _username_regex().match(unquote(value))
+        return _RE_USERNAME.match(unquote(value))
     if colon_count < 1:
-        return _username_regex().match(value)
+        return _RE_USERNAME.match(value)
     username, password = value.rsplit(":", 1)
-    return _username_regex().match(username) and all(
+    return _RE_USERNAME.match(username) and all(
         char_to_avoid not in password for char_to_avoid in ("/", "?", "#", "@")
     )
 
@@ -138,7 +133,7 @@ def _validate_optionals(path: str, query: str, fragment: str, strict_query: bool
     """Validate path query and fragments."""
     optional_segments = True
     if path:
-        optional_segments &= bool(_path_regex().match(path))
+        optional_segments &= bool(_RE_PATH.match(path))
     try:
         if (
             query
@@ -254,4 +249,4 @@ def url(
             rfc_2782,
         )
         and _validate_optionals(path, query, fragment, strict_query)
-    )
+    )
diff --git a/tests/test_finance.py b/tests/test_finance.py
@@ -24,7 +24,7 @@ def test_returns_failed_validation_on_invalid_cusip(value: str):
 # ==> ISIN <== #
 
 
-@pytest.mark.parametrize("value", ["US0004026250", "JP000K0VF054", "US0378331005"])
+@pytest.mark.parametrize("value", ["US0004026250", "JP3435000009", "US0378331005"])
 def test_returns_true_on_valid_isin(value: str):
     """Test returns true on valid isin."""
     assert isin(value)