# Source code for helix_ir.pii.heuristics

"""Field name heuristics for PII classification."""

from __future__ import annotations

import re

# Ordered list of (compiled pattern, pii_class) rules — not a mapping.
# Every pattern is compiled with re.IGNORECASE and is meant to be searched
# against the last component of a field path. Rules are consulted top to
# bottom and the first hit decides the class, so the order below matters.
_FIELD_HEURISTICS: list[tuple[re.Pattern[str], str]] = [
    (re.compile(source, re.IGNORECASE), label)
    for source, label in (
        # IP address — listed ahead of "address" so that ip_address is classed as ip
        (r"(^|_)ip_?addr(ess)?(_|$)|(^|_)ip(_|$)|(^|_)client_?ip(_|$)|(^|_)remote_?addr(_|$)", "ip"),
        # Email — 'email' or 'mail' as a whole underscore-delimited word
        (r"(^|_)email(_|$)|(^|_)mail(_|$)|(^|_)email_addr(ess)?(_|$)", "email"),
        # Phone
        (r"(^|_)phone(_|$)|(^|_)mobile(_|$)|(^|_)cell(_|$)|(^|_)tel(ephone)?(_|$)|(^|_)contact_no(_|$)", "phone"),
        # Personal name (full / first / last / display)
        (r"(^|_)(full_?)?name(_|$)|(^|_)first_?name(_|$)|(^|_)last_?name(_|$)|(^|_)display_?name(_|$)", "name"),
        # Postal address components
        (r"(^|_)address(_|$)|(^|_)addr(_|$)|(^|_)street(_|$)|(^|_)city(_|$)|(^|_)state(_|$)|(^|_)pincode(_|$)|(^|_)zip(_|$)|(^|_)postal(_|$)", "address"),
        # Date of birth
        (r"(^|_)dob(_|$)|(^|_)date_of_birth(_|$)|(^|_)birth_?date(_|$)|(^|_)birthday(_|$)", "dob"),
        # SSN
        (r"(^|_)ssn(_|$)|(^|_)social_?security(_|$)", "ssn"),
        # PAN
        (r"(^|_)pan(_|$)|(^|_)pan_?no(_|$)|(^|_)pan_?number(_|$)", "pan"),
        # Aadhaar
        (r"(^|_)aadhaar(_|$)|(^|_)aadhar(_|$)|(^|_)uid(_|$)|(^|_)undp_id(_|$)", "aadhaar"),
        # GSTIN
        (r"(^|_)gstin(_|$)|(^|_)gst_?no(_|$)|(^|_)gst_?number(_|$)", "gstin"),
        # URL
        (r"(^|_)url(_|$)|(^|_)website(_|$)|(^|_)homepage(_|$)|(^|_)link(_|$)", "url"),
        # Credit card
        (r"(^|_)card_?no(_|$)|(^|_)credit_?card(_|$)|(^|_)cc_?no(_|$)|(^|_)card_?number(_|$)", "credit_card"),
        # IBAN / bank account
        (r"(^|_)iban(_|$)|(^|_)bank_?account(_|$)|(^|_)account_?no(_|$)", "iban"),
        # SWIFT / BIC
        (r"(^|_)swift(_|$)|(^|_)bic(_|$)", "swift"),
        # Credentials (not PII per se, but sensitive)
        (r"(^|_)password(_|$)|(^|_)passwd(_|$)|(^|_)pwd(_|$)|(^|_)secret(_|$)|(^|_)token(_|$)", "secret"),
        # Location
        (r"(^|_)latitude(_|$)|(^|_)longitude(_|$)|(^|_)lat(_|$)|(^|_)lon(_|$)|(^|_)geo(_|$)|(^|_)coords?(_|$)", "geo"),
        # Gender
        (r"(^|_)gender(_|$)|(^|_)sex(_|$)", "gender"),
        # Age
        (r"(^|_)age(_|$)|(^|_)age_?group(_|$)", "age"),
        # National ID
        (r"(^|_)national_?id(_|$)|(^|_)passport(_|$)|(^|_)nid(_|$)", "national_id"),
    )
]


def detect_pii_from_field_name(field_name: str) -> str | None:
    """Classify a field name as a PII class, if any heuristic matches.

    Only the final segment of a dotted path is examined, and anything from
    the first ``[`` in that segment onward (such as an array index) is
    ignored. Heuristics are tried in the order they appear in
    ``_FIELD_HEURISTICS``.

    Args:
        field_name: Field name or dotted path, e.g.
            ``"user.contacts[0].email"``.

    Returns:
        The label of the first heuristic whose pattern is found in the
        final segment, or ``None`` when no heuristic matches.
    """
    # Keep the text after the last "." and before the first "[".
    leaf = field_name.rpartition(".")[2].partition("[")[0]
    return next(
        (label for regex, label in _FIELD_HEURISTICS if regex.search(leaf)),
        None,
    )