新增数据补全和提交脚本，支持从 job_bundle.json 读取数据并执行 API 请求；实现批量提交功能，记录成功与失败的日志

2026-02-09 22:06:11 +08:00
parent a2d8a774ca
commit c3716d7a6c
6 changed files with 866 additions and 1 deletions
--- a/scripts/py/enricher.py
+++ b/scripts/py/enricher.py
@@ -0,0 +1,186 @@
+"""
+enricher.py — API 数据补全脚本
+
+读取前端生成的 job_bundle.json，执行 enrichment_rules 中的 API 请求，
+将结果合并到 source_data 中，输出最终完整的 JSON 文件。
+
+用法:
+    python enricher.py job_bundle.json
+    python enricher.py job_bundle.json -o enriched_data.json
+    python enricher.py job_bundle.json --concurrency 10
+"""
+
+import argparse
+import asyncio
+import json
+import re
+import sys
+from pathlib import Path
+
+import aiohttp
+
+
+def resolve_path(obj, path: str):
+    """按点号路径从 dict/list 中取值，类似 lodash.get"""
+    keys = path.split(".")
+    current = obj
+    for key in keys:
+        if isinstance(current, dict) and key in current:
+            current = current[key]
+        elif isinstance(current, list):
+            try:
+                current = current[int(key)]
+            except (ValueError, IndexError):
+                return None
+        else:
+            return None
+    return current
+
+
+def render_template(template: str, row: dict) -> str:
+    """将 {{变量名}} 替换为行数据中的对应值"""
+    def replacer(match: re.Match) -> str:
+        key = match.group(1)
+        value = row.get(key, "")
+        return str(value) if value is not None else ""
+
+    return re.sub(r"\{\{(.+?)\}\}", replacer, template)
+
+
+async def fetch_one(
+    session: aiohttp.ClientSession,
+    rule: dict,
+    row: dict,
+    row_index: int,
+    semaphore: asyncio.Semaphore,
+) -> tuple[int, str, object]:
+    """对单行数据执行一条 enrichment rule 的 API 请求"""
+    target_key = rule["target_key"]
+    fallback = rule.get("fallback_value")
+
+    url = render_template(rule["url_template"], row)
+
+    headers = {}
+    if rule.get("headers"):
+        for k, v in rule["headers"].items():
+            headers[k] = render_template(v, row)
+
+    method = rule.get("method", "GET").upper()
+
+    body = None
+    if method == "POST" and rule.get("body_template"):
+        body = render_template(rule["body_template"], row)
+
+    async with semaphore:
+        try:
+            kwargs: dict = {"headers": headers}
+            if body is not None:
+                try:
+                    kwargs["json"] = json.loads(body)
+                except json.JSONDecodeError:
+                    kwargs["data"] = body
+
+            async with session.request(method, url, **kwargs) as resp:
+                if resp.status >= 400:
+                    print(f"  [WARN] Row {row_index} | {target_key} | HTTP {resp.status} <- {url}")
+                    return row_index, target_key, fallback
+
+                data = await resp.json(content_type=None)
+                value = resolve_path(data, rule["response_path"])
+                if value is None:
+                    value = fallback
+                return row_index, target_key, value
+
+        except Exception as e:
+            print(f"  [ERROR] Row {row_index} | {target_key} | {type(e).__name__}: {e}")
+            return row_index, target_key, fallback
+
+
+async def run_enrichments(
+    source_data: list[dict],
+    rules: list[dict],
+    concurrency: int,
+) -> list[dict]:
+    """对所有行执行所有 enrichment rules"""
+    if not rules:
+        print("No enrichment rules configured. Outputting source data as-is.")
+        return source_data
+
+    semaphore = asyncio.Semaphore(concurrency)
+    tasks = []
+
+    total_calls = len(source_data) * len(rules)
+    print(f"Enriching {len(source_data)} rows x {len(rules)} rule(s) = {total_calls} API calls")
+    print(f"Concurrency: {concurrency}")
+    print()
+
+    async with aiohttp.ClientSession() as session:
+        for row_idx, row in enumerate(source_data):
+            for rule in rules:
+                tasks.append(fetch_one(session, rule, row, row_idx, semaphore))
+
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+    error_count = 0
+    for result in results:
+        if isinstance(result, Exception):
+            print(f"  [ERROR] Unexpected: {result}")
+            error_count += 1
+            continue
+        row_idx, target_key, value = result
+        source_data[row_idx][target_key] = value
+
+    print()
+    print(f"Done. {total_calls - error_count}/{total_calls} calls succeeded.")
+    return source_data
+
+
+async def main():
+    parser = argparse.ArgumentParser(description="Enrich job_bundle.json with API data and output final JSON.")
+    parser.add_argument("bundle", help="Path to job_bundle.json")
+    parser.add_argument("-o", "--output", type=str, default=None, help="Output file path (default: <bundle>_enriched.json)")
+    parser.add_argument("--concurrency", type=int, default=5, help="Max concurrent API requests (default: 5)")
+    args = parser.parse_args()
+
+    bundle_path = Path(args.bundle)
+    if not bundle_path.exists():
+        print(f"Error: File not found: {bundle_path}")
+        sys.exit(1)
+
+    with open(bundle_path, "r", encoding="utf-8") as f:
+        bundle = json.load(f)
+
+    meta = bundle.get("meta", {})
+    config = bundle.get("config", {})
+    source_data = bundle.get("source_data", [])
+
+    print(f"=== Job Bundle v{meta.get('version', '?')} ===")
+    print(f"Generated: {meta.get('generated_at', '?')}")
+    print(f"Rows: {len(source_data)}")
+    print(f"Static rules: {len(config.get('static_rules', []))}")
+    print(f"Enrichment rules: {len(config.get('enrichment_rules', []))}")
+    print()
+
+    enriched = await run_enrichments(
+        source_data,
+        config.get("enrichment_rules", []),
+        args.concurrency,
+    )
+
+    # 构建输出，保留 submission 配置
+    output = {
+        "meta": meta,
+        "submission": config.get("submission", {}),
+        "data": enriched,
+    }
+
+    output_path = Path(args.output) if args.output else bundle_path.with_name(bundle_path.stem + "_enriched.json")
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(output, f, ensure_ascii=False, indent=2)
+
+    print(f"\nEnriched data saved to: {output_path}")
+    print(f"Next step: python submitter.py {output_path}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())