新增数据补全和提交脚本，支持从 job_bundle.json 读取数据并执行 API 请求；实现批量提交功能，记录成功与失败的日志

2026-02-09 22:06:11 +08:00
parent a2d8a774ca
commit c3716d7a6c
6 changed files with 866 additions and 1 deletions
--- a/scripts/py/enricher.py
+++ b/scripts/py/enricher.py
@@ -0,0 +1,186 @@
+"""
+enricher.py — API 数据补全脚本
+
+读取前端生成的 job_bundle.json，执行 enrichment_rules 中的 API 请求，
+将结果合并到 source_data 中，输出最终完整的 JSON 文件。
+
+用法:
+    python enricher.py job_bundle.json
+    python enricher.py job_bundle.json -o enriched_data.json
+    python enricher.py job_bundle.json --concurrency 10
+"""
+
+import argparse
+import asyncio
+import json
+import re
+import sys
+from pathlib import Path
+
+import aiohttp
+
+
+def resolve_path(obj, path: str):
+    """按点号路径从 dict/list 中取值，类似 lodash.get"""
+    keys = path.split(".")
+    current = obj
+    for key in keys:
+        if isinstance(current, dict) and key in current:
+            current = current[key]
+        elif isinstance(current, list):
+            try:
+                current = current[int(key)]
+            except (ValueError, IndexError):
+                return None
+        else:
+            return None
+    return current
+
+
+def render_template(template: str, row: dict) -> str:
+    """将 {{变量名}} 替换为行数据中的对应值"""
+    def replacer(match: re.Match) -> str:
+        key = match.group(1)
+        value = row.get(key, "")
+        return str(value) if value is not None else ""
+
+    return re.sub(r"\{\{(.+?)\}\}", replacer, template)
+
+
+async def fetch_one(
+    session: aiohttp.ClientSession,
+    rule: dict,
+    row: dict,
+    row_index: int,
+    semaphore: asyncio.Semaphore,
+) -> tuple[int, str, object]:
+    """对单行数据执行一条 enrichment rule 的 API 请求"""
+    target_key = rule["target_key"]
+    fallback = rule.get("fallback_value")
+
+    url = render_template(rule["url_template"], row)
+
+    headers = {}
+    if rule.get("headers"):
+        for k, v in rule["headers"].items():
+            headers[k] = render_template(v, row)
+
+    method = rule.get("method", "GET").upper()
+
+    body = None
+    if method == "POST" and rule.get("body_template"):
+        body = render_template(rule["body_template"], row)
+
+    async with semaphore:
+        try:
+            kwargs: dict = {"headers": headers}
+            if body is not None:
+                try:
+                    kwargs["json"] = json.loads(body)
+                except json.JSONDecodeError:
+                    kwargs["data"] = body
+
+            async with session.request(method, url, **kwargs) as resp:
+                if resp.status >= 400:
+                    print(f"  [WARN] Row {row_index} | {target_key} | HTTP {resp.status} <- {url}")
+                    return row_index, target_key, fallback
+
+                data = await resp.json(content_type=None)
+                value = resolve_path(data, rule["response_path"])
+                if value is None:
+                    value = fallback
+                return row_index, target_key, value
+
+        except Exception as e:
+            print(f"  [ERROR] Row {row_index} | {target_key} | {type(e).__name__}: {e}")
+            return row_index, target_key, fallback
+
+
+async def run_enrichments(
+    source_data: list[dict],
+    rules: list[dict],
+    concurrency: int,
+) -> list[dict]:
+    """对所有行执行所有 enrichment rules"""
+    if not rules:
+        print("No enrichment rules configured. Outputting source data as-is.")
+        return source_data
+
+    semaphore = asyncio.Semaphore(concurrency)
+    tasks = []
+
+    total_calls = len(source_data) * len(rules)
+    print(f"Enriching {len(source_data)} rows x {len(rules)} rule(s) = {total_calls} API calls")
+    print(f"Concurrency: {concurrency}")
+    print()
+
+    async with aiohttp.ClientSession() as session:
+        for row_idx, row in enumerate(source_data):
+            for rule in rules:
+                tasks.append(fetch_one(session, rule, row, row_idx, semaphore))
+
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+    error_count = 0
+    for result in results:
+        if isinstance(result, Exception):
+            print(f"  [ERROR] Unexpected: {result}")
+            error_count += 1
+            continue
+        row_idx, target_key, value = result
+        source_data[row_idx][target_key] = value
+
+    print()
+    print(f"Done. {total_calls - error_count}/{total_calls} calls succeeded.")
+    return source_data
+
+
+async def main():
+    parser = argparse.ArgumentParser(description="Enrich job_bundle.json with API data and output final JSON.")
+    parser.add_argument("bundle", help="Path to job_bundle.json")
+    parser.add_argument("-o", "--output", type=str, default=None, help="Output file path (default: <bundle>_enriched.json)")
+    parser.add_argument("--concurrency", type=int, default=5, help="Max concurrent API requests (default: 5)")
+    args = parser.parse_args()
+
+    bundle_path = Path(args.bundle)
+    if not bundle_path.exists():
+        print(f"Error: File not found: {bundle_path}")
+        sys.exit(1)
+
+    with open(bundle_path, "r", encoding="utf-8") as f:
+        bundle = json.load(f)
+
+    meta = bundle.get("meta", {})
+    config = bundle.get("config", {})
+    source_data = bundle.get("source_data", [])
+
+    print(f"=== Job Bundle v{meta.get('version', '?')} ===")
+    print(f"Generated: {meta.get('generated_at', '?')}")
+    print(f"Rows: {len(source_data)}")
+    print(f"Static rules: {len(config.get('static_rules', []))}")
+    print(f"Enrichment rules: {len(config.get('enrichment_rules', []))}")
+    print()
+
+    enriched = await run_enrichments(
+        source_data,
+        config.get("enrichment_rules", []),
+        args.concurrency,
+    )
+
+    # 构建输出，保留 submission 配置
+    output = {
+        "meta": meta,
+        "submission": config.get("submission", {}),
+        "data": enriched,
+    }
+
+    output_path = Path(args.output) if args.output else bundle_path.with_name(bundle_path.stem + "_enriched.json")
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(output, f, ensure_ascii=False, indent=2)
+
+    print(f"\nEnriched data saved to: {output_path}")
+    print(f"Next step: python submitter.py {output_path}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/scripts/py/submitter.py
+++ b/scripts/py/submitter.py
@@ -0,0 +1,161 @@
+"""
+submitter.py — 数据提交脚本
+
+读取 enricher.py 输出的 JSON 文件，按批次提交到目标接口。
+记录提交成功和失败的记录到单独的日志文件。
+
+用法:
+    python submitter.py enriched_data.json
+    python submitter.py enriched_data.json --batch-size 100
+    python submitter.py enriched_data.json --url https://api.example.com/import  # 覆盖 URL
+    python submitter.py enriched_data.json --dry-run  # 只输出不提交
+"""
+
+import argparse
+import asyncio
+import json
+import sys
+from datetime import datetime
+from pathlib import Path
+
+import aiohttp
+
+
+async def submit_batch(
+    session: aiohttp.ClientSession,
+    url: str,
+    method: str,
+    batch: list[dict],
+    batch_index: int,
+    total_batches: int,
+) -> tuple[bool, int, str, list[dict]]:
+    """提交一个批次，返回 (success, status_code, response_text, batch_data)"""
+    try:
+        async with session.request(method, url, json=batch) as resp:
+            status = resp.status
+            body = await resp.text()
+            ok = status < 400
+            tag = "OK" if ok else "FAIL"
+            print(f"  Batch {batch_index}/{total_batches}: HTTP {status} {tag} ({len(batch)} records)")
+            return ok, status, body, batch
+    except Exception as e:
+        msg = f"{type(e).__name__}: {e}"
+        print(f"  Batch {batch_index}/{total_batches}: ERROR - {msg} ({len(batch)} records)")
+        return False, 0, msg, batch
+
+
+async def main():
+    parser = argparse.ArgumentParser(description="Submit enriched data to target API in batches.")
+    parser.add_argument("input", help="Path to enriched JSON file (from enricher.py)")
+    parser.add_argument("--url", type=str, default=None, help="Override target URL from config")
+    parser.add_argument("--method", type=str, default=None, choices=["POST", "PUT"], help="Override HTTP method")
+    parser.add_argument("--batch-size", type=int, default=None, help="Override batch size")
+    parser.add_argument("--dry-run", action="store_true", help="Print what would be submitted without sending")
+    args = parser.parse_args()
+
+    input_path = Path(args.input)
+    if not input_path.exists():
+        print(f"Error: File not found: {input_path}")
+        sys.exit(1)
+
+    with open(input_path, "r", encoding="utf-8") as f:
+        payload = json.load(f)
+
+    data = payload.get("data", [])
+    submission = payload.get("submission", {})
+
+    target_url = args.url or submission.get("target_url", "")
+    method = args.method or submission.get("method", "POST")
+    batch_size = args.batch_size or submission.get("batch_size", 50)
+
+    if not target_url:
+        print("Error: No target URL configured. Use --url or set in job bundle.")
+        sys.exit(1)
+
+    print(f"=== Submitter ===")
+    print(f"Input: {input_path} ({len(data)} records)")
+    print(f"Target: {method} {target_url}")
+    print(f"Batch size: {batch_size}")
+    print()
+
+    batches = [data[i : i + batch_size] for i in range(0, len(data), batch_size)]
+    total_batches = len(batches)
+
+    if args.dry_run:
+        print(f"[DRY RUN] Would submit {len(data)} records in {total_batches} batch(es)")
+        for i, batch in enumerate(batches, 1):
+            print(f"  Batch {i}/{total_batches}: {len(batch)} records")
+        print("\nDry run complete. No data was sent.")
+        return
+
+    success_records: list[dict] = []
+    failed_records: list[dict] = []
+    failed_details: list[dict] = []
+
+    print(f"Submitting {len(data)} records in {total_batches} batch(es)...\n")
+
+    async with aiohttp.ClientSession() as session:
+        for i, batch in enumerate(batches, 1):
+            ok, status, body, batch_data = await submit_batch(
+                session, target_url, method, batch, i, total_batches
+            )
+            if ok:
+                success_records.extend(batch_data)
+            else:
+                failed_records.extend(batch_data)
+                failed_details.append({
+                    "batch_index": i,
+                    "status": status,
+                    "response": body[:500],
+                    "record_count": len(batch_data),
+                })
+
+    # 结果统计
+    print()
+    print(f"=== Result ===")
+    print(f"Success: {len(success_records)} records")
+    print(f"Failed:  {len(failed_records)} records")
+
+    # 写入日志文件
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    base_name = input_path.stem
+
+    if success_records:
+        success_file = input_path.with_name(f"{base_name}_success_{timestamp}.json")
+        with open(success_file, "w", encoding="utf-8") as f:
+            json.dump(success_records, f, ensure_ascii=False, indent=2)
+        print(f"\nSuccess log: {success_file}")
+
+    if failed_records:
+        failed_file = input_path.with_name(f"{base_name}_failed_{timestamp}.json")
+        report = {
+            "summary": {
+                "total_failed": len(failed_records),
+                "failed_batches": len(failed_details),
+                "target_url": target_url,
+                "timestamp": timestamp,
+            },
+            "batch_errors": failed_details,
+            "failed_records": failed_records,
+        }
+        with open(failed_file, "w", encoding="utf-8") as f:
+            json.dump(report, f, ensure_ascii=False, indent=2)
+        print(f"Failed log:  {failed_file}")
+        print(f"\nTo retry failed records:")
+        print(f'  python submitter.py {failed_file} --url "{target_url}"')
+        # 生成一个可直接重试的文件
+        retry_file = input_path.with_name(f"{base_name}_retry_{timestamp}.json")
+        retry_payload = {
+            "submission": submission,
+            "data": failed_records,
+        }
+        with open(retry_file, "w", encoding="utf-8") as f:
+            json.dump(retry_payload, f, ensure_ascii=False, indent=2)
+        print(f"  or: python submitter.py {retry_file}")
+
+    if not success_records and not failed_records:
+        print("\nNo records to submit.")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())