Replace repeated long strings with short aliases.
Uses ctx.parsed when available to skip
json.loads. Wraps the result in a {_refs, _data}
envelope.
Source code in packages/axm-smelt/src/axm_smelt/strategies/dedup_values.py
| Python |
|---|
def apply(self, ctx: SmeltContext) -> SmeltContext:
    """Swap repeated long strings for short ``$R<n>`` aliases.

    The already-decoded ``ctx.parsed`` value is used when present, which
    avoids a ``json.loads`` round-trip; otherwise ``ctx.text`` is decoded
    here. The output is a ``{_refs, _data}`` envelope: ``_refs`` maps each
    alias back to its original string and ``_data`` is the value returned
    by ``_replace_strings``.

    Args:
        ctx: Context carrying the payload as ``text`` and, optionally,
            as ``parsed``.

    Returns:
        ``ctx`` itself when the text does not start with ``{`` or ``[``,
        when it cannot be decoded as JSON, or when no collected string
        occurs at least ``_MIN_OCCURRENCES`` times. Otherwise a new
        ``SmeltContext`` holding the compact JSON envelope and the same
        ``format`` as ``ctx``.
    """
    data = ctx.parsed
    if data is None:
        candidate = ctx.text.strip()
        # Only object/array-looking text is worth handing to the decoder;
        # the one-character slice also covers the empty string.
        if candidate[:1] not in ("{", "["):
            return ctx
        try:
            data = json.loads(candidate)
        except (json.JSONDecodeError, ValueError):
            return ctx

    found: list[str] = []
    _collect_strings(data, found)

    # Keep only the strings that occur often enough to be aliased.
    duplicates = {
        value: hits
        for value, hits in Counter(found).items()
        if hits >= _MIN_OCCURRENCES
    }
    if not duplicates:
        return ctx

    # Biggest savings (length * occurrences) get the lowest alias numbers.
    ranked = sorted(
        duplicates.items(),
        key=lambda item: len(item[0]) * item[1],
        reverse=True,
    )
    to_alias: dict[str, str] = {
        value: f"$R{rank}" for rank, (value, _) in enumerate(ranked)
    }
    from_alias: dict[str, str] = {
        alias: value for value, alias in to_alias.items()
    }

    envelope = {"_refs": from_alias, "_data": _replace_strings(data, to_alias)}
    payload = json.dumps(envelope, separators=(",", ":"), ensure_ascii=False)
    return SmeltContext(text=payload, format=ctx.format)
|