Skip to content

Dedup values

dedup_values

Dedup-values strategy — collapse repeated string values in JSON.

DedupValuesStrategy

Bases: SmeltStrategy

Replace frequently repeated long string values with aliases.

Source code in packages/axm-smelt/src/axm_smelt/strategies/dedup_values.py
Python
class DedupValuesStrategy(SmeltStrategy):
    """Strategy that swaps repeated string values for ``$R<n>`` aliases."""

    @property
    def name(self) -> str:
        """Identifier of this strategy."""
        return "dedup_values"

    @property
    def category(self) -> str:
        """Category label of this strategy."""
        return "structural"

    def apply(self, ctx: SmeltContext) -> SmeltContext:
        """Alias repeated strings and emit a ``{_refs, _data}`` envelope.

        The document in ``ctx.parsed`` is used when it is not ``None``;
        otherwise ``ctx.text`` is decoded with ``json.loads``. ``ctx`` is
        returned unchanged when the stripped text does not start with
        ``{`` or ``[``, when decoding fails, or when no collected string
        occurs at least ``_MIN_OCCURRENCES`` times.
        """
        document = ctx.parsed
        if document is None:
            candidate = ctx.text.strip()
            # An empty string fails this prefix check as well.
            if not candidate.startswith(("{", "[")):
                return ctx
            try:
                document = json.loads(candidate)
            except ValueError:  # json.JSONDecodeError subclasses ValueError
                return ctx

        found: list[str] = []
        _collect_strings(document, found)
        frequent = {
            value: hits
            for value, hits in Counter(found).items()
            if hits >= _MIN_OCCURRENCES
        }
        if not frequent:
            return ctx

        # Largest length * occurrences first, so it gets the lowest alias index.
        ranked = sorted(
            frequent.items(),
            key=lambda entry: len(entry[0]) * entry[1],
            reverse=True,
        )
        aliases = {f"$R{rank}": value for rank, (value, _) in enumerate(ranked)}
        lookup = {value: alias for alias, value in aliases.items()}

        envelope = {"_refs": aliases, "_data": _replace_strings(document, lookup)}
        payload = json.dumps(envelope, separators=(",", ":"), ensure_ascii=False)
        return SmeltContext(text=payload, format=ctx.format)
apply(ctx)

Replace repeated long strings with short aliases.

Uses ctx.parsed when available to skip json.loads. Wraps the result in a {_refs, _data} envelope.

Source code in packages/axm-smelt/src/axm_smelt/strategies/dedup_values.py
Python
def apply(self, ctx: SmeltContext) -> SmeltContext:
    """Alias repeated strings and emit a ``{_refs, _data}`` envelope.

    The document in ``ctx.parsed`` is used when it is not ``None``;
    otherwise ``ctx.text`` is decoded with ``json.loads``. ``ctx`` is
    returned unchanged when the stripped text does not start with
    ``{`` or ``[``, when decoding fails, or when no collected string
    occurs at least ``_MIN_OCCURRENCES`` times.
    """
    document = ctx.parsed
    if document is None:
        candidate = ctx.text.strip()
        # An empty string fails this prefix check as well.
        if not candidate.startswith(("{", "[")):
            return ctx
        try:
            document = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError subclasses ValueError
            return ctx

    found: list[str] = []
    _collect_strings(document, found)
    frequent = {
        value: hits
        for value, hits in Counter(found).items()
        if hits >= _MIN_OCCURRENCES
    }
    if not frequent:
        return ctx

    # Largest length * occurrences first, so it gets the lowest alias index.
    ranked = sorted(
        frequent.items(),
        key=lambda entry: len(entry[0]) * entry[1],
        reverse=True,
    )
    aliases = {f"$R{rank}": value for rank, (value, _) in enumerate(ranked)}
    lookup = {value: alias for alias, value in aliases.items()}

    envelope = {"_refs": aliases, "_data": _replace_strings(document, lookup)}
    payload = json.dumps(envelope, separators=(",", ":"), ensure_ascii=False)
    return SmeltContext(text=payload, format=ctx.format)