From 01ce41eee5ca0f294c9954e2a054471087b06286 Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Fri, 26 Jun 2026 10:23:11 +0800 Subject: [PATCH] Add element_proposal: propose a clean element list from raw pixels Set-of-Marks/observation/grounding assume you already have element boxes, but a game/custom-drawn app/remote desktop has no accessibility tree. propose_elements builds the top-of-funnel list from pixels: widget blobs (Canny+morphology+connected_boxes) + text regions, fused via element_parse (ocr>icon priority is the drop-widget-that-is-really-text cross-check), reading-ordered and tagged text/widget. tag_kinds is the pure labeller. --- WHATS_NEW.md | 6 ++ .../doc/new_features/v220_features_doc.rst | 51 +++++++++++ .../Zh/doc/new_features/v220_features_doc.rst | 42 +++++++++ je_auto_control/__init__.py | 3 + .../gui/script_builder/command_schema.py | 19 +++++ .../utils/element_proposal/__init__.py | 6 ++ .../element_proposal/element_proposal.py | 80 +++++++++++++++++ .../utils/executor/action_executor.py | 19 +++++ .../utils/mcp_server/tools/_factories.py | 25 ++++++ .../utils/mcp_server/tools/_handlers.py | 12 +++ .../headless/test_element_proposal_batch.py | 85 +++++++++++++++++++ 11 files changed, 348 insertions(+) create mode 100644 docs/source/Eng/doc/new_features/v220_features_doc.rst create mode 100644 docs/source/Zh/doc/new_features/v220_features_doc.rst create mode 100644 je_auto_control/utils/element_proposal/__init__.py create mode 100644 je_auto_control/utils/element_proposal/element_proposal.py create mode 100644 test/unit_test/headless/test_element_proposal_batch.py diff --git a/WHATS_NEW.md b/WHATS_NEW.md index 83c0dbe3..18ba3291 100644 --- a/WHATS_NEW.md +++ b/WHATS_NEW.md @@ -2,6 +2,12 @@ ## What's new (2026-06-26) +### Template-Free Element Proposal (Pixels to Elements) + +Get a clean numbered element list straight from the screen when there's no accessibility tree. 
Full reference: [`docs/source/Eng/doc/new_features/v220_features_doc.rst`](docs/source/Eng/doc/new_features/v220_features_doc.rst). + +- **`propose_elements` / `tag_kinds`** (`AC_propose_elements`, `AC_tag_kinds`): Set-of-Marks, `observation` and the grounding helpers all assume you already have element boxes — but a game, a custom-drawn app or a remote desktop has no accessibility tree. `propose_elements` builds that top-of-funnel list from pixels: detect widget boxes (closed-edge blobs via Canny + morphology + `connected_boxes`) and text boxes (`text_regions.find_text_regions`), fuse them — the `element_parse` `ocr > icon` priority *is* the "drop widget-that-is-really-text" cross-check — and return them in reading order, each tagged `text` or `widget`. `tag_kinds` is the pure labeller. cv2 imported lazily; the labeller is fully testable. Seventh and final feature of the ROUND-15 perception lane. No `PySide6`. + ### Classify a Widget from Its Pixel Shape Tell a checkbox from a radio button from a text field — from pixels, no model. Full reference: [`docs/source/Eng/doc/new_features/v219_features_doc.rst`](docs/source/Eng/doc/new_features/v219_features_doc.rst). diff --git a/docs/source/Eng/doc/new_features/v220_features_doc.rst b/docs/source/Eng/doc/new_features/v220_features_doc.rst new file mode 100644 index 00000000..17956790 --- /dev/null +++ b/docs/source/Eng/doc/new_features/v220_features_doc.rst @@ -0,0 +1,51 @@ +Template-Free Element Proposal (Pixels to Elements) +=================================================== + +Set-of-Marks, ``observation`` and the grounding helpers all assume you already +have a list of element boxes — but on a screen the framework doesn't model +(a game, a custom-drawn app, a remote desktop) there is no accessibility tree to +provide one. 
``element_proposal`` builds that top-of-funnel list from pixels: +detect candidate *widget* boxes (closed-edge blobs) and *text* boxes +(:func:`text_regions.find_text_regions`), fuse them — dropping widget boxes that +are really just text — and return them in reading order, each tagged ``text`` or +``widget``. + +* :func:`propose_elements` — the full pixel-to-elements pipeline. +* :func:`tag_kinds` — pure: label fused boxes ``text`` / ``widget`` by source and + keep their reading-order ``index``. + +The fusion / cross-check / ordering reuse :mod:`element_parse` — the ``ocr`` > +``icon`` source priority *is* the "drop widget-that-is-really-text" check — and +the text detection reuses :mod:`text_regions`. ``cv2`` is imported lazily so the +module stays importable; :func:`tag_kinds` is pure and fully testable. Imports no +``PySide6``. + +Headless API +------------ + +.. code-block:: python + + from je_auto_control import propose_elements, mark_elements + + # No accessibility tree? Propose elements straight from the screen: + elements = propose_elements(min_area=120) + # [{'box': [x, y, w, h], 'kind': 'widget', 'index': 0}, ...] + + # Feed them to Set-of-Marks like any other element list: + marks = mark_elements(elements) + +``propose_elements`` returns ``[{box, kind, index}]`` in reading order, where +``kind`` is ``text`` or ``widget``. It is the missing top-of-funnel for the +agent stack on un-modelled UIs: pixels in, a clean numbered element list out, +ready for marking, observation or grounding. Tune ``min_area`` for the smallest +control you care about and ``iou_threshold`` for how aggressively overlapping +text and widget boxes are merged. + +Executor commands +----------------- + +``AC_propose_elements`` (``region`` ``[x, y, w, h]`` / ``min_area`` / +``iou_threshold`` → ``{elements}``) runs the full pipeline on the screen, and +``AC_tag_kinds`` (``elements`` JSON list → ``{elements}``, pure) labels a +pre-fused list. 
Both have matching read-only ``ac_*`` MCP tools and Script +Builder commands under **Image**. diff --git a/docs/source/Zh/doc/new_features/v220_features_doc.rst b/docs/source/Zh/doc/new_features/v220_features_doc.rst new file mode 100644 index 00000000..45554aff --- /dev/null +++ b/docs/source/Zh/doc/new_features/v220_features_doc.rst @@ -0,0 +1,42 @@ +免模板元素提案(像素到元素) +============================ + +Set-of-Marks、``observation`` 與 grounding 輔助函式都假設你已有一份元素方框清單——但在框架無法 +建模的畫面上(遊戲、自繪 app、遠端桌面),並沒有無障礙樹可提供。``element_proposal`` 從像素建立 +這份漏斗頂端清單:偵測候選*控制項*方框(封閉邊緣 blob)與*文字*方框 +(:func:`text_regions.find_text_regions`),將兩者融合——丟棄其實只是文字的控制項方框—— +並依閱讀順序回傳,每個標記為 ``text`` 或 ``widget``。 + +* :func:`propose_elements` ——完整的像素到元素管線。 +* :func:`tag_kinds` ——純函式:依來源把融合後的方框標記 ``text`` / ``widget``,並保留其閱讀順序 ``index``。 + +融合 / 交叉檢查 / 排序重用 :mod:`element_parse`——``ocr`` > ``icon`` 來源優先序*即*「丟棄其實是 +文字的控制項」檢查——文字偵測則重用 :mod:`text_regions`。``cv2`` 採延遲匯入,故模組仍可匯入; +:func:`tag_kinds` 為純函式且可完整測試。不匯入 ``PySide6``。 + +無頭 API +-------- + +.. code-block:: python + + from je_auto_control import propose_elements, mark_elements + + # 沒有無障礙樹?直接從畫面提案元素: + elements = propose_elements(min_area=120) + # [{'box': [x, y, w, h], 'kind': 'widget', 'index': 0}, ...] 
+ + # 像任何元素清單一樣餵給 Set-of-Marks: + marks = mark_elements(elements) + +``propose_elements`` 依閱讀順序回傳 ``[{box, kind, index}]``,``kind`` 為 ``text`` 或 ``widget``。 +它是 agent 堆疊在未建模 UI 上缺少的漏斗頂端:像素進、乾淨的編號元素清單出,可供標記、observation +或 grounding。以 ``min_area`` 調整你在意的最小控制項,以 ``iou_threshold`` 調整重疊文字與控制項 +方框合併的積極程度。 + +執行器指令 +---------- + +``AC_propose_elements``(``region`` ``[x, y, w, h]`` / ``min_area`` / +``iou_threshold`` → ``{elements}``)在畫面上執行完整管線,``AC_tag_kinds`` +(``elements`` JSON 清單 → ``{elements}``,純函式)則標記預先融合的清單。皆以對應的唯讀 +``ac_*`` MCP 工具及 Script Builder 指令(位於 **Image** 分類下)形式提供。 diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py index 54b3f178..1ccf1d6d 100644 --- a/je_auto_control/__init__.py +++ b/je_auto_control/__init__.py @@ -149,6 +149,8 @@ from je_auto_control.utils.icon_classify import ( box_features, classify_icon, classify_widget, ) +# Propose a clean element list from raw pixels (template-free) +from je_auto_control.utils.element_proposal import propose_elements, tag_kinds # Rich clipboard formats — RTF + CSV/TSV codecs and Windows get / set from je_auto_control.utils.clipboard_rich_formats import ( build_rtf, csv_to_rows, get_clipboard_csv, get_clipboard_rtf, rows_to_csv, @@ -1779,6 +1781,7 @@ def start_autocontrol_gui(*args, **kwargs): "normalize_theme", "match_theme", "localize_changes", "rank_changes", "classify_widget", "box_features", "classify_icon", + "propose_elements", "tag_kinds", "build_rtf", "rtf_to_text", "rows_to_csv", "csv_to_rows", "set_clipboard_rtf", "get_clipboard_rtf", "set_clipboard_csv", "get_clipboard_csv", diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py index f642c163..50e2031b 100644 --- a/je_auto_control/gui/script_builder/command_schema.py +++ b/je_auto_control/gui/script_builder/command_schema.py @@ -4648,6 +4648,25 @@ def _add_work_queue_specs(specs: List[CommandSpec]) -> None: ), description="Classify the widget in an image box from its 
pixels.", )) + specs.append(CommandSpec( + "AC_propose_elements", "Image", "Propose Elements (template-free)", + fields=( + FieldSpec("region", FieldType.STRING, optional=True, + placeholder="[x, y, w, h]"), + FieldSpec("min_area", FieldType.INT, optional=True, default=80), + FieldSpec("iou_threshold", FieldType.FLOAT, optional=True, + default=0.5), + ), + description="Propose text/widget element boxes from raw screen pixels.", + )) + specs.append(CommandSpec( + "AC_tag_kinds", "Image", "Tag Element Kinds", + fields=( + FieldSpec("elements", FieldType.STRING, + placeholder="JSON list of fused boxes"), + ), + description="Label fused element boxes text/widget by source.", + )) specs.append(CommandSpec( "AC_normalize_ext", "Shell", "Normalize Extension", fields=( diff --git a/je_auto_control/utils/element_proposal/__init__.py b/je_auto_control/utils/element_proposal/__init__.py new file mode 100644 index 00000000..103cb9f0 --- /dev/null +++ b/je_auto_control/utils/element_proposal/__init__.py @@ -0,0 +1,6 @@ +"""Propose a clean element list from raw pixels, with no template or model.""" +from je_auto_control.utils.element_proposal.element_proposal import ( + propose_elements, tag_kinds, +) + +__all__ = ["propose_elements", "tag_kinds"] diff --git a/je_auto_control/utils/element_proposal/element_proposal.py b/je_auto_control/utils/element_proposal/element_proposal.py new file mode 100644 index 00000000..3b08b274 --- /dev/null +++ b/je_auto_control/utils/element_proposal/element_proposal.py @@ -0,0 +1,80 @@ +"""Propose a clean element list from raw pixels, with no template or model. + +Set-of-Marks, ``observation`` and the grounding helpers all assume you already +have a list of element boxes — but on a screen the framework doesn't model +(a game, a custom-drawn app, a remote desktop) there is no accessibility tree to +provide one. 
``element_proposal`` builds that top-of-funnel list from pixels: +detect candidate *widget* boxes (closed-edge blobs) and *text* boxes +(:func:`text_regions.find_text_regions`), fuse them — dropping widget boxes that +are really just text — and return them in reading order, each tagged ``text`` or +``widget``. + +* :func:`propose_elements` — the full pixel-to-elements pipeline. +* :func:`tag_kinds` — pure: label fused boxes ``text`` / ``widget`` by source and + keep their reading-order ``index``. + +The fusion / cross-check / ordering reuse :mod:`element_parse` (the ``ocr`` > +``icon`` priority *is* the "drop widget-that-is-really-text" check) and +:mod:`text_regions`; ``cv2`` is imported lazily so the module stays importable. +:func:`tag_kinds` is pure and fully testable. Imports no ``PySide6``. +""" +from typing import Any, Dict, List, Optional, Sequence + +# Reading-order source tag to element kind. +_KIND_BY_SOURCE = {"ocr": "text", "icon": "widget", "a11y": "element"} + + +def tag_kinds(elements: Sequence[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Label fused boxes ``text`` / ``widget`` by source (pure). + + Each input box carries a ``source`` (``ocr`` / ``icon``) and an ``index`` + from :func:`element_parse.reading_order`. Returns ``[{box, kind, index}]``. 
+ """ + result: List[Dict[str, Any]] = [] + for element in elements: + box = [int(element["x"]), int(element["y"]), + int(element["width"]), int(element["height"])] + kind = _KIND_BY_SOURCE.get(element.get("source"), "widget") + result.append({"box": box, "kind": kind, "index": element.get("index")}) + return result + + +def _reasonable(box: Dict[str, Any], frame_w: int, frame_h: int) -> bool: + """Keep plausibly-widget blobs: not the whole frame, not a thin rule.""" + width, height = int(box["width"]), int(box["height"]) + if width >= 0.95 * frame_w and height >= 0.95 * frame_h: + return False + aspect = width / height if height else 0.0 + return 0.05 <= aspect <= 15.0 + + +def _widget_boxes(gray: Any, min_area: int) -> List[Dict[str, Any]]: + """Detect candidate widget boxes as closed-edge blobs (cv2).""" + import cv2 + from je_auto_control.utils.cv2_utils.blobs import connected_boxes + edges = cv2.Canny(gray, 50, 150) + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5)) + closed = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel) + height, width = gray.shape[:2] + return [box for box in connected_boxes(closed, min_area=int(min_area)) + if _reasonable(box, width, height)] + + +def propose_elements(source: Optional[Any] = None, *, + region: Optional[Sequence[int]] = None, min_area: int = 80, + iou_threshold: float = 0.5) -> List[Dict[str, Any]]: + """Propose ``text`` / ``widget`` element boxes from pixels, in reading order. + + Detects widget blobs and text regions on ``source`` (a fresh screen grab of + ``region`` by default), fuses them (overlapping text wins over widget), and + orders them. Returns ``[{box, kind, index}]``. 
+ """ + from je_auto_control.utils.element_parse import fuse_elements, reading_order + from je_auto_control.utils.text_regions import find_text_regions + from je_auto_control.utils.visual_match.visual_match import _haystack_gray + gray = _haystack_gray(source, region) + text = find_text_regions(gray, min_area=int(min_area)) + widgets = _widget_boxes(gray, int(min_area)) + fused = fuse_elements(ocr_boxes=text, icon_boxes=widgets, + iou_threshold=float(iou_threshold)) + return tag_kinds(reading_order(fused)) diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py index 62168052..7f5a8761 100644 --- a/je_auto_control/utils/executor/action_executor.py +++ b/je_auto_control/utils/executor/action_executor.py @@ -2946,6 +2946,23 @@ def _classify_icon(source: Any, box: Any) -> Dict[str, Any]: return classify_icon(str(source), _coerce_list(box)) +def _propose_elements(region: Any = None, min_area: Any = 80, + iou_threshold: Any = 0.5) -> Dict[str, Any]: + """Adapter: propose text/widget element boxes from pixels (device).""" + from je_auto_control.utils.element_proposal import propose_elements + elements = propose_elements(region=_coerce_region(region), + min_area=int(min_area), + iou_threshold=float(iou_threshold)) + return {"elements": elements} + + +def _tag_kinds(elements: Any) -> Dict[str, Any]: + """Adapter: label fused boxes text/widget by source (pure).""" + from je_auto_control.utils.element_proposal import tag_kinds + items = _coerce_list(elements) if elements else [] + return {"elements": tag_kinds(items)} + + def _normalize_ext(target: str) -> Dict[str, Any]: """Adapter: the lowercased extension of a path / bare ext (pure).""" from je_auto_control.utils.file_assoc import normalize_ext @@ -6989,6 +7006,8 @@ def __init__(self): "AC_localize_changes": _localize_changes, "AC_classify_widget": _classify_widget, "AC_classify_icon": _classify_icon, + "AC_propose_elements": _propose_elements, + "AC_tag_kinds": 
_tag_kinds, "AC_normalize_ext": _normalize_ext, "AC_file_association": _file_association, "AC_get_control_text": _get_control_text, diff --git a/je_auto_control/utils/mcp_server/tools/_factories.py b/je_auto_control/utils/mcp_server/tools/_factories.py index 3b11e843..26c2427e 100644 --- a/je_auto_control/utils/mcp_server/tools/_factories.py +++ b/je_auto_control/utils/mcp_server/tools/_factories.py @@ -4159,6 +4159,31 @@ def img_histogram_tools() -> List[MCPTool]: handler=h.classify_icon, annotations=READ_ONLY, ), + MCPTool( + name="ac_propose_elements", + description=("Propose text/widget element boxes from raw screen " + "pixels (template-free): detect widget blobs + text " + "regions, fuse, order. 'region' [x,y,w,h] clips. " + "Returns {elements:[{box, kind, index}]}."), + input_schema=schema({"region": {"type": "array", + "items": {"type": "integer"}}, + "min_area": {"type": "integer"}, + "iou_threshold": {"type": "number"}}), + handler=h.propose_elements, + annotations=READ_ONLY, + ), + MCPTool( + name="ac_tag_kinds", + description=("Label fused element boxes 'text'/'widget' by source " + "(pure). 'elements' is a list of {x,y,width,height," + "source,index}. 
Returns {elements:[{box, kind, " + "index}]}."), + input_schema=schema({"elements": {"type": "array", + "items": {"type": "object"}}}, + required=["elements"]), + handler=h.tag_kinds, + annotations=READ_ONLY, + ), ] diff --git a/je_auto_control/utils/mcp_server/tools/_handlers.py b/je_auto_control/utils/mcp_server/tools/_handlers.py index 66875681..91d38a68 100644 --- a/je_auto_control/utils/mcp_server/tools/_handlers.py +++ b/je_auto_control/utils/mcp_server/tools/_handlers.py @@ -797,6 +797,18 @@ def classify_icon(source, box): return _classify_icon(source, box) +def propose_elements(region=None, min_area=80, iou_threshold=0.5): + from je_auto_control.utils.executor.action_executor import ( + _propose_elements, + ) + return _propose_elements(region, min_area, iou_threshold) + + +def tag_kinds(elements): + from je_auto_control.utils.executor.action_executor import _tag_kinds + return _tag_kinds(elements) + + def normalize_ext(target): from je_auto_control.utils.executor.action_executor import _normalize_ext return _normalize_ext(target) diff --git a/test/unit_test/headless/test_element_proposal_batch.py b/test/unit_test/headless/test_element_proposal_batch.py new file mode 100644 index 00000000..c88fe950 --- /dev/null +++ b/test/unit_test/headless/test_element_proposal_batch.py @@ -0,0 +1,85 @@ +"""Headless tests for element_proposal (pure tag_kinds + cv2 pipeline).""" +import pytest + +import je_auto_control as ac +from je_auto_control.utils.element_proposal import propose_elements, tag_kinds + + +# --- pure tag_kinds ------------------------------------------------------- + +def test_tag_kinds_labels_by_source(): + fused = [ + {"x": 0, "y": 0, "width": 30, "height": 12, "source": "ocr", + "index": 0}, + {"x": 0, "y": 20, "width": 16, "height": 16, "source": "icon", + "index": 1}, + ] + tagged = tag_kinds(fused) + assert tagged[0] == {"box": [0, 0, 30, 12], "kind": "text", "index": 0} + assert tagged[1] == {"box": [0, 20, 16, 16], "kind": "widget", "index": 1} + 
+ +def test_tag_kinds_unknown_source_is_widget(): + tagged = tag_kinds([{"x": 1, "y": 2, "width": 3, "height": 4}]) + assert tagged[0]["kind"] == "widget" + assert tagged[0]["box"] == [1, 2, 3, 4] + + +def test_tag_kinds_empty(): + assert tag_kinds([]) == [] + + +# --- cv2 propose_elements (per-function importorskip) --------------------- + +def test_propose_elements_finds_widgets(): + np = pytest.importorskip("numpy") + cv2 = pytest.importorskip("cv2") + canvas = np.full((200, 240), 245, dtype="uint8") + # three distinct outlined "widgets" + cv2.rectangle(canvas, (20, 20), (90, 60), 0, 2) + cv2.rectangle(canvas, (130, 30), (210, 70), 0, 2) + cv2.rectangle(canvas, (40, 110), (200, 160), 0, 2) + elements = propose_elements(canvas, min_area=120) + assert len(elements) >= 2 + # every element is well-formed and in reading order + for position, element in enumerate(elements): + assert set(element) == {"box", "kind", "index"} + assert element["index"] == position + assert element["kind"] in ("text", "widget") + assert len(element["box"]) == 4 + # nothing spans the whole frame + assert all(not (e["box"][2] >= 228 and e["box"][3] >= 190) + for e in elements) + + +def test_propose_elements_blank_screen_is_empty_or_small(): + np = pytest.importorskip("numpy") + pytest.importorskip("cv2") + blank = np.full((120, 120), 255, dtype="uint8") + assert propose_elements(blank, min_area=200) == [] + + +# --- wiring (cv2-free) ---------------------------------------------------- + +def test_executor_pure_tag_path(): + from je_auto_control.utils.executor.action_executor import _tag_kinds + out = _tag_kinds('[{"x":0,"y":0,"width":10,"height":10,"source":"icon"}]') + assert out["elements"][0]["kind"] == "widget" + + +def test_wiring(): + known = set(ac.executor.known_commands()) + assert {"AC_propose_elements", "AC_tag_kinds"} <= known + from je_auto_control.utils.mcp_server.tools import ( + build_default_tool_registry, + ) + names = {t.name for t in build_default_tool_registry()} + 
assert {"ac_propose_elements", "ac_tag_kinds"} <= names + from je_auto_control.gui.script_builder.command_schema import _build_specs + specs = {s.command for s in _build_specs()} + assert {"AC_propose_elements", "AC_tag_kinds"} <= specs + + +def test_facade_exports(): + for name in ("propose_elements", "tag_kinds"): + assert hasattr(ac, name) and name in ac.__all__