Skip to content

Dedup values

dedup_values

Dedup-values strategy — collapse repeated string values in JSON.

DedupValuesStrategy

Bases: SmeltStrategy

Replace frequently repeated long string values with aliases.

Source code in packages/axm-smelt/src/axm_smelt/strategies/dedup_values.py
Python
class DedupValuesStrategy(SmeltStrategy):
    """Strategy that swaps repeated long JSON string values for short aliases."""

    @property
    def name(self) -> str:
        """Registry identifier of this strategy."""
        return "dedup_values_with_refs"

    @property
    def category(self) -> str:
        """Category this strategy belongs to (``structural``)."""
        return "structural"

    def apply(self, ctx: SmeltContext) -> SmeltContext:
        """Alias repeated long strings and wrap the data in an envelope.

        ``ctx.parsed`` is used directly when it is not ``None``;
        otherwise ``ctx.text`` is stripped and decoded with
        ``json.loads``. Strings gathered by ``collect_strings`` that
        occur at least ``_MIN_OCCURRENCES`` times receive aliases
        ``$R0``, ``$R1``, ... in descending order of
        ``len(value) * count``.

        The output text is the compact JSON encoding of
        ``{"_refs": {alias: value}, "_data": <aliased input>}``.

        ``ctx`` is returned unchanged when:

        * the text is empty, does not start with ``{`` or ``[``, or
          fails to decode;
        * the input is a dict that already has a top-level ``_refs``
          or ``_data`` key (both are reserved for the envelope);
        * no string is repeated often enough.

        Args:
            ctx: Context carrying the input text and, optionally, its
                already-parsed form.

        Returns:
            The original ``ctx`` on pass-through, otherwise a new
            ``SmeltContext`` built from the envelope text and
            ``ctx.format``.
        """
        document = ctx.parsed
        if document is None:
            candidate = ctx.text.strip()
            # Only JSON objects/arrays are handled; an empty string also
            # fails this prefix test.
            if not candidate.startswith(("{", "[")):
                return ctx
            try:
                document = json.loads(candidate)
            except ValueError:
                # json.JSONDecodeError is a ValueError subclass.
                return ctx

        if isinstance(document, dict) and any(
            reserved in document for reserved in ("_refs", "_data")
        ):
            logger.debug(
                "dedup_values_with_refs: input has reserved top-level key, skipping"
            )
            return ctx

        found: list[str] = []
        collect_strings(document, found)
        frequency = Counter(found)

        # Candidates ordered by estimated savings (length * count), largest
        # first; ties keep first-seen order because the sort is stable.
        ranked = sorted(
            (value for value, n in frequency.items() if n >= _MIN_OCCURRENCES),
            key=lambda value: len(value) * frequency[value],
            reverse=True,
        )
        if not ranked:
            return ctx

        aliases = {f"$R{rank}": value for rank, value in enumerate(ranked)}
        lookup = {value: alias for alias, value in aliases.items()}

        envelope = {"_refs": aliases, "_data": replace_strings(document, lookup)}
        payload = json.dumps(envelope, separators=(",", ":"), ensure_ascii=False)
        return SmeltContext(text=payload, format=ctx.format)
category property

Strategy category (structural).

name property

Strategy identifier used in the registry.

apply(ctx)

Replace repeated long strings with short aliases.

Uses ctx.parsed when available to skip json.loads. Wraps the result in a {_refs, _data} envelope.

The keys _refs and _data are reserved for the output envelope. If the input is a dict that already contains either as a top-level key, the strategy is a pass-through to avoid silent collision in the wrapped output.

Source code in packages/axm-smelt/src/axm_smelt/strategies/dedup_values.py
Python
def apply(self, ctx: SmeltContext) -> SmeltContext:
    """Alias repeated long strings and wrap the data in an envelope.

    ``ctx.parsed`` is used directly when it is not ``None``;
    otherwise ``ctx.text`` is stripped and decoded with
    ``json.loads``. Strings gathered by ``collect_strings`` that
    occur at least ``_MIN_OCCURRENCES`` times receive aliases
    ``$R0``, ``$R1``, ... in descending order of
    ``len(value) * count``.

    The output text is the compact JSON encoding of
    ``{"_refs": {alias: value}, "_data": <aliased input>}``.

    ``ctx`` is returned unchanged when:

    * the text is empty, does not start with ``{`` or ``[``, or
      fails to decode;
    * the input is a dict that already has a top-level ``_refs``
      or ``_data`` key (both are reserved for the envelope);
    * no string is repeated often enough.

    Args:
        ctx: Context carrying the input text and, optionally, its
            already-parsed form.

    Returns:
        The original ``ctx`` on pass-through, otherwise a new
        ``SmeltContext`` built from the envelope text and
        ``ctx.format``.
    """
    document = ctx.parsed
    if document is None:
        candidate = ctx.text.strip()
        # Only JSON objects/arrays are handled; an empty string also
        # fails this prefix test.
        if not candidate.startswith(("{", "[")):
            return ctx
        try:
            document = json.loads(candidate)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass.
            return ctx

    if isinstance(document, dict) and any(
        reserved in document for reserved in ("_refs", "_data")
    ):
        logger.debug(
            "dedup_values_with_refs: input has reserved top-level key, skipping"
        )
        return ctx

    found: list[str] = []
    collect_strings(document, found)
    frequency = Counter(found)

    # Candidates ordered by estimated savings (length * count), largest
    # first; ties keep first-seen order because the sort is stable.
    ranked = sorted(
        (value for value, n in frequency.items() if n >= _MIN_OCCURRENCES),
        key=lambda value: len(value) * frequency[value],
        reverse=True,
    )
    if not ranked:
        return ctx

    aliases = {f"$R{rank}": value for rank, value in enumerate(ranked)}
    lookup = {value: alias for alias, value in aliases.items()}

    envelope = {"_refs": aliases, "_data": replace_strings(document, lookup)}
    payload = json.dumps(envelope, separators=(",", ":"), ensure_ascii=False)
    return SmeltContext(text=payload, format=ctx.format)

collect_strings(data, strings)

Walk data recursively and collect every string value that is at least _MIN_LENGTH characters long. Dict keys are not collected.

Source code in packages/axm-smelt/src/axm_smelt/strategies/dedup_values.py
Python
def collect_strings(data: JsonValue, strings: list[str]) -> None:
    """Append each sufficiently long string value in *data* to *strings*.

    Dicts (values only, not keys) and lists are traversed recursively
    in iteration order. A string is appended only when its length is
    at least ``_MIN_LENGTH``; every other value is ignored.

    Args:
        data: Value to walk.
        strings: Accumulator list, extended in place.
    """
    if isinstance(data, str):
        if len(data) < _MIN_LENGTH:
            return
        strings.append(data)
        return

    if isinstance(data, dict):
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        # Numbers, booleans and None carry no strings.
        return

    for child in children:
        collect_strings(child, strings)

replace_strings(data, lookup)

Return a new structure in which every string value found in lookup is replaced by its short alias. Dict keys are left unchanged.

Source code in packages/axm-smelt/src/axm_smelt/strategies/dedup_values.py
Python
def replace_strings(
    data: JsonValue,
    lookup: dict[str, str],
) -> JsonValue:
    """Return *data* with string values present in *lookup* swapped for aliases.

    Lists and dicts are rebuilt recursively; dict keys are kept as they
    are and only dict values are rewritten. Any other value, including a
    string with no entry in *lookup*, is returned unchanged.

    Args:
        data: Value to transform.
        lookup: Mapping from original string to its alias.

    Returns:
        The alias for a matching string, a new list/dict for
        containers, or *data* itself otherwise.
    """
    if isinstance(data, list):
        return [replace_strings(element, lookup) for element in data]
    if isinstance(data, dict):
        return {key: replace_strings(value, lookup) for key, value in data.items()}
    if isinstance(data, str):
        # Falls back to the string itself when it has no alias.
        return lookup.get(data, data)
    return data