Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
769 changes: 769 additions & 0 deletions crates/headroom-core/src/transforms/content_detector.rs

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions crates/headroom-core/src/transforms/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,13 @@

pub mod adaptive_sizer;
pub mod anchor_selector;
pub mod content_detector;
pub mod diff_compressor;
pub mod smart_crusher;

pub use content_detector::{
detect_content_type, is_json_array_of_dicts, ContentType, DetectionResult,
};
pub use diff_compressor::{
DiffCompressionResult, DiffCompressor, DiffCompressorConfig, DiffCompressorStats,
};
59 changes: 57 additions & 2 deletions crates/headroom-parity/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,15 @@ pub trait TransformComparator {
}

/// Compare a single fixture against a comparator and return an outcome.
///
/// f64 normalization: `serde_json` (without the `arbitrary_precision`
/// feature) has an asymmetry — values constructed via `json!(f64)` keep
/// full precision (e.g. `0.9500000000000001`), but values parsed from
/// fixture JSON sometimes round to a neighboring f64 (e.g. `0.95`,
/// differing by 1 ULP). To make comparisons robust we round-trip the
/// comparator's output through `to_string` + `from_str` so it goes
/// through the same lossy parser the fixture did. Bit-identical f64s
/// from both sides then compare equal.
pub fn compare_fixture(
comparator: &dyn TransformComparator,
fixture: &Fixture,
Expand All @@ -57,12 +66,15 @@ pub fn compare_fixture(
})
}
};
if actual == fixture.output {
let actual_normalized: serde_json::Value =
serde_json::from_str(&serde_json::to_string(&actual)?)
.context("re-parsing comparator output through serde_json (f64 normalization)")?;
if actual_normalized == fixture.output {
Ok(ComparisonOutcome::Match)
} else {
Ok(ComparisonOutcome::Diff {
expected: serde_json::to_string_pretty(&fixture.output)?,
actual: serde_json::to_string_pretty(&actual)?,
actual: serde_json::to_string_pretty(&actual_normalized)?,
})
}
}
Expand Down Expand Up @@ -401,6 +413,48 @@ impl TransformComparator for SmartCrusherComparator {
}
}

/// Parity comparator for the `content_detector` transform.
///
/// Every recorded fixture input is a single JSON string. That string is
/// handed to the Rust port, and the detection is re-emitted in the shape
/// Python's recorder serializes for `DetectionResult`:
///
/// ```json
/// {"content_type": "json_array", "confidence": 1.0, "metadata": {...}}
/// ```
///
/// On the Python side the recorder's `_json_default` hook takes care of
/// both the dataclass and the enum:
/// - `DetectionResult` → `asdict(...)`, i.e. `{content_type, confidence, metadata}`.
/// - `ContentType` → its `.value`, the lowercase tag (e.g. "json_array").
///
/// Metadata numbers are recorded as JSON numbers (Python ints stay ints).
/// The detector's metadata map is wrapped as a JSON object as-is, so its
/// `serde_json::Number` values reach the comparison untouched.
pub struct ContentDetectorComparator;

impl TransformComparator for ContentDetectorComparator {
    /// Name of the transform this comparator handles.
    fn name(&self) -> &str {
        "content_detector"
    }

    /// Runs content detection on a fixture input.
    ///
    /// `input` must be a JSON string; any other JSON value is reported as
    /// an error. `_config` is not consulted. The returned object has the
    /// keys `content_type`, `confidence` and `metadata`.
    fn run(
        &self,
        input: &serde_json::Value,
        _config: &serde_json::Value,
    ) -> Result<serde_json::Value> {
        use headroom_core::transforms::detect_content_type;

        let text = input
            .as_str()
            .context("content_detector fixture input must be a JSON string")?;
        let detection = detect_content_type(text);

        // Pull the three fields out by name first so the JSON literal
        // below reads as a plain description of the output shape.
        let type_tag = detection.content_type.as_str();
        let confidence = detection.confidence;
        let metadata = serde_json::Value::Object(detection.metadata);

        Ok(serde_json::json!({
            "content_type": type_tag,
            "confidence": confidence,
            "metadata": metadata,
        }))
    }
}

/// Every built-in comparator, in a stable order.
pub fn builtin_comparators() -> Vec<Box<dyn TransformComparator>> {
vec![
Expand All @@ -410,6 +464,7 @@ pub fn builtin_comparators() -> Vec<Box<dyn TransformComparator>> {
Box::new(TokenizerComparator),
Box::new(CcrComparator),
Box::new(SmartCrusherComparator),
Box::new(ContentDetectorComparator),
]
}

Expand Down
113 changes: 111 additions & 2 deletions crates/headroom-py/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,13 @@ use headroom_core::transforms::smart_crusher::{
SmartCrusherConfig as RustSmartCrusherConfig,
};
use headroom_core::transforms::{
DiffCompressionResult, DiffCompressor, DiffCompressorConfig, DiffCompressorStats,
detect_content_type as rust_detect_content_type,
is_json_array_of_dicts as rust_is_json_array_of_dicts, ContentType as RustContentType,
DetectionResult as RustDetectionResult, DiffCompressionResult, DiffCompressor,
DiffCompressorConfig, DiffCompressorStats,
};
use pyo3::prelude::*;
use pyo3::types::PyDict;
use pyo3::types::{PyDict, PyString};

/// Identity stub used by the Python smoke test to verify linkage.
#[pyfunction]
Expand Down Expand Up @@ -753,6 +756,109 @@ impl PySmartCrusher {
}
}

// ─── ContentDetector ───────────────────────────────────────────────────────

/// Python-visible counterpart of
/// `headroom.transforms.content_detector.DetectionResult`.
///
/// It exposes the same three attributes as the Python dataclass —
/// `content_type`, `confidence` and `metadata` — so the existing Python
/// `ContentRouter` (which `import`s `DetectionResult` directly) keeps
/// reading them without modification.
///
/// `content_type` is surfaced as the lowercase string tag (e.g.
/// `"json_array"`); the Python wrapper converts it back into the
/// `ContentType` enum so call-sites look identical.
#[pyclass(name = "DetectionResult", module = "headroom._core")]
#[derive(Clone)]
struct PyDetectionResult {
    inner: RustDetectionResult,
}

#[pymethods]
impl PyDetectionResult {
    /// Lowercase string tag of the detected content type.
    #[getter]
    fn content_type(&self) -> &'static str {
        self.inner.content_type.as_str()
    }

    /// Confidence value attached to the detection.
    #[getter]
    fn confidence(&self) -> f64 {
        self.inner.confidence
    }

    /// Per-type metadata bag (e.g. `{"language": "python", "pattern_matches": 5}`
    /// for code, `{"item_count": 3, "is_dict_array": true}` for JSON arrays).
    ///
    /// A new `dict` is built on every access, so mutating the returned
    /// object never touches the Rust-side value.
    #[getter]
    fn metadata<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyDict>> {
        let out = PyDict::new_bound(py);
        for (key, value) in &self.inner.metadata {
            // Detection metadata is a flat dict of scalars (ints, bools,
            // strings), so each value maps to one Python primitive and no
            // recursion is needed.
            let converted: PyObject = match value {
                serde_json::Value::Null => py.None(),
                serde_json::Value::Bool(flag) => flag.into_py(py),
                serde_json::Value::String(text) => PyString::new_bound(py, text).into_py(py),
                // Try unsigned, then signed, then float; `None` only if
                // the number fits none of the three.
                serde_json::Value::Number(num) => num
                    .as_u64()
                    .map(|unsigned| unsigned.into_py(py))
                    .or_else(|| num.as_i64().map(|signed| signed.into_py(py)))
                    .or_else(|| num.as_f64().map(|float| float.into_py(py)))
                    .unwrap_or_else(|| py.None()),
                // Arrays / objects are not emitted by detection today. If
                // one ever appears, surface it as its JSON text instead of
                // silently dropping it.
                nested => PyString::new_bound(py, &nested.to_string()).into_py(py),
            };
            out.set_item(key, converted)?;
        }
        Ok(out)
    }

    /// Compact representation: type tag, confidence, and the number of
    /// metadata keys (not their contents).
    fn __repr__(&self) -> String {
        let inner = &self.inner;
        format!(
            "DetectionResult(content_type={:?}, confidence={}, metadata=<{} keys>)",
            inner.content_type.as_str(),
            inner.confidence,
            inner.metadata.len()
        )
    }
}

/// Detect the type of `content`. Returns a `DetectionResult` with the
/// same field surface as Python's dataclass.
///
/// Releases the GIL while detecting — pattern matching can be substantial
/// on large bodies (HTML scans, 500-line diff windows), and freeing the
/// GIL lets other Python threads make progress in the meantime.
///
/// `content` stays borrowed for the duration of the call, so the detector
/// reads it in place; no owned copy of the (possibly large) input is made
/// before the GIL is released.
#[pyfunction]
fn detect_content_type(py: Python<'_>, content: &str) -> PyDetectionResult {
    // The closure only captures a shared `&str`, which is safe to use
    // without the GIL held.
    let result = py.allow_threads(|| rust_detect_content_type(content));
    PyDetectionResult { inner: result }
}

/// Quick check: is `content` a JSON array of dictionaries (the format
/// `SmartCrusher` natively handles)?
///
/// The check runs with the GIL released. `content` is read in place
/// through the borrowed `&str`, so no copy of the input is allocated.
#[pyfunction]
fn is_json_array_of_dicts(py: Python<'_>, content: &str) -> bool {
    // The closure only captures a shared `&str`, which is safe to use
    // without the GIL held.
    py.allow_threads(|| rust_is_json_array_of_dicts(content))
}

// Suppress the unused-import warning for `RustContentType` in case nothing
// else in this file names the enum directly — `as_str()` is the public
// surface the bindings rely on. The anonymous const holds a closure that
// mentions one variant; the closure is never called, it only needs to
// reference the type.
const _: fn() = || {
let _ = RustContentType::PlainText;
};

// ─── Module init ───────────────────────────────────────────────────────────

#[pymodule]
Expand All @@ -765,5 +871,8 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<PySmartCrusherConfig>()?;
m.add_class::<PyCrushResult>()?;
m.add_class::<PySmartCrusher>()?;
m.add_class::<PyDetectionResult>()?;
m.add_function(wrap_pyfunction!(detect_content_type, m)?)?;
m.add_function(wrap_pyfunction!(is_json_array_of_dicts, m)?)?;
Ok(())
}
15 changes: 15 additions & 0 deletions tests/parity/fixtures/content_detector/247811aecfdec556.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"config": {},
"input": "diff --combined merged.py\nindex aaa..bbb..ccc 100644\n--- a/merged.py\n+++ b/merged.py\n@@@ -1,4 -1,4 +1,5 @@@\n unchanged\n- branch_a_only\n -branch_b_only\n++merge_added\n",
"input_sha256": "247811aecfdec55647a37c088c611f8f093119d95335da3d10aa317455159655",
"output": {
"confidence": 1.0,
"content_type": "diff",
"metadata": {
"change_lines": 1,
"header_matches": 3
}
},
"recorded_at": "2026-04-28T06:39:34.959992+00:00",
"transform": "content_detector"
}
15 changes: 15 additions & 0 deletions tests/parity/fixtures/content_detector/37c86907c057e293.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"config": {},
"input": "package com.example;\n\npublic class Foo {\n @Override\n public String toString() { return \"foo\"; }\n}\n\nprivate interface Bar {}\nprotected enum Baz { A, B }\n",
"input_sha256": "37c86907c057e293385ffaef6c4278a4b903fca82e3d52fb42df601f47010a3d",
"output": {
"confidence": 0.7857142857142857,
"content_type": "source_code",
"metadata": {
"language": "java",
"pattern_matches": 5
}
},
"recorded_at": "2026-04-28T06:39:34.962175+00:00",
"transform": "content_detector"
}
12 changes: 12 additions & 0 deletions tests/parity/fixtures/content_detector/3a0b14d2dea7c876.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"config": {},
"input": "Just some prose text without any structure or special markers.",
"input_sha256": "3a0b14d2dea7c87688fc40482a02f006accfae1da50f0734a1e395fed13fc744",
"output": {
"confidence": 0.5,
"content_type": "text",
"metadata": {}
},
"recorded_at": "2026-04-28T06:39:34.962324+00:00",
"transform": "content_detector"
}
12 changes: 12 additions & 0 deletions tests/parity/fixtures/content_detector/3ba74a1f57c5351e.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"config": {},
"input": "<div>a</div>\n<span>b</span>\n<script>x()</script>\n<style>y</style>",
"input_sha256": "3ba74a1f57c5351ecb47efcfff73080bc3f96a75589b459e20424fe0c715be39",
"output": {
"confidence": 0.5,
"content_type": "text",
"metadata": {}
},
"recorded_at": "2026-04-28T06:39:34.960293+00:00",
"transform": "content_detector"
}
12 changes: 12 additions & 0 deletions tests/parity/fixtures/content_detector/4efe946303371867.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"config": {},
"input": "",
"input_sha256": "4efe946303371867389009a0144a8ea235bfe9f6e87a9395b0c0453b8e3f99c0",
"output": {
"confidence": 0.0,
"content_type": "text",
"metadata": {}
},
"recorded_at": "2026-04-28T06:39:34.962454+00:00",
"transform": "content_detector"
}
15 changes: 15 additions & 0 deletions tests/parity/fixtures/content_detector/53400da2b7dd2428.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"config": {},
"input": "import x from 'y';\nexport const foo = 1;\nfunction bar() { return 42; }\nconst f = async function() {};\nmodule.exports = { foo, bar };\n",
"input_sha256": "53400da2b7dd2428c8146088e60779a8d61c2430a2c7c95a6d2b07e83b9b726e",
"output": {
"confidence": 0.9,
"content_type": "source_code",
"metadata": {
"language": "javascript",
"pattern_matches": 5
}
},
"recorded_at": "2026-04-28T06:39:34.961426+00:00",
"transform": "content_detector"
}
15 changes: 15 additions & 0 deletions tests/parity/fixtures/content_detector/552d30c30c2a6793.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"config": {},
"input": "diff --git a/foo.py b/foo.py\nindex abc..def 100644\n--- a/foo.py\n+++ b/foo.py\n@@ -1,3 +1,3 @@\n-old line\n+new line\n unchanged\n",
"input_sha256": "552d30c30c2a67939857c22f85696406f1557476912b41d9918031a15364163a",
"output": {
"confidence": 1.0,
"content_type": "diff",
"metadata": {
"change_lines": 2,
"header_matches": 3
}
},
"recorded_at": "2026-04-28T06:39:34.959842+00:00",
"transform": "content_detector"
}
15 changes: 15 additions & 0 deletions tests/parity/fixtures/content_detector/5e8e1bc29f6c71c5.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"config": {},
"input": "import os\nfrom typing import Any\n\n@dataclass\nclass Foo:\n \"\"\"Docstring.\"\"\"\n def bar(self):\n return 42\n\ndef baz():\n pass\n\nif __name__ == '__main__':\n baz()\n",
"input_sha256": "5e8e1bc29f6c71c5b6523dc65f91a615b15e8bd27d633cbcd54293bbe43b6951",
"output": {
"confidence": 0.850909090909091,
"content_type": "source_code",
"metadata": {
"language": "python",
"pattern_matches": 8
}
},
"recorded_at": "2026-04-28T06:39:34.961138+00:00",
"transform": "content_detector"
}
16 changes: 16 additions & 0 deletions tests/parity/fixtures/content_detector/628c8985c40adb7d.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"config": {},
"input": "INFO starting build\nWARN deprecated API used\nERROR compilation failed\nFAILED test_x\nPASSED test_y\n============================================\n",
"input_sha256": "628c8985c40adb7d265eac37436ea45a549ba9ed2b10eb47a1fa7b478ce17e77",
"output": {
"confidence": 0.9500000000000001,
"content_type": "build",
"metadata": {
"error_matches": 3,
"pattern_matches": 6,
"total_lines": 6
}
},
"recorded_at": "2026-04-28T06:39:34.960715+00:00",
"transform": "content_detector"
}
15 changes: 15 additions & 0 deletions tests/parity/fixtures/content_detector/67f4b1a1697c8aa9.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"config": {},
"input": "interface User { id: number; name: string; }\ntype Maybe<T> = T | null;\nenum Color { Red, Green, Blue }\nfunction f(x: number): boolean { return x > 0; }\n",
"input_sha256": "67f4b1a1697c8aa916e4acb0920a92279bbc35efa3849edc9bc2aa8dcff73fb3",
"output": {
"confidence": 0.76,
"content_type": "source_code",
"metadata": {
"language": "typescript",
"pattern_matches": 3
}
},
"recorded_at": "2026-04-28T06:39:34.961598+00:00",
"transform": "content_detector"
}
Loading
Loading