新增数据补全和提交脚本,支持从 job_bundle.json 读取数据并执行 API 请求;实现批量提交功能,记录成功与失败的日志
This commit is contained in:
186
scripts/py/enricher.py
Normal file
186
scripts/py/enricher.py
Normal file
@@ -0,0 +1,186 @@
|
||||
"""
|
||||
enricher.py — API 数据补全脚本
|
||||
|
||||
读取前端生成的 job_bundle.json,执行 enrichment_rules 中的 API 请求,
|
||||
将结果合并到 source_data 中,输出最终完整的 JSON 文件。
|
||||
|
||||
用法:
|
||||
python enricher.py job_bundle.json
|
||||
python enricher.py job_bundle.json -o enriched_data.json
|
||||
python enricher.py job_bundle.json --concurrency 10
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import aiohttp
|
||||
|
||||
|
||||
def resolve_path(obj, path: str):
|
||||
"""按点号路径从 dict/list 中取值,类似 lodash.get"""
|
||||
keys = path.split(".")
|
||||
current = obj
|
||||
for key in keys:
|
||||
if isinstance(current, dict) and key in current:
|
||||
current = current[key]
|
||||
elif isinstance(current, list):
|
||||
try:
|
||||
current = current[int(key)]
|
||||
except (ValueError, IndexError):
|
||||
return None
|
||||
else:
|
||||
return None
|
||||
return current
|
||||
|
||||
|
||||
def render_template(template: str, row: dict) -> str:
|
||||
"""将 {{变量名}} 替换为行数据中的对应值"""
|
||||
def replacer(match: re.Match) -> str:
|
||||
key = match.group(1)
|
||||
value = row.get(key, "")
|
||||
return str(value) if value is not None else ""
|
||||
|
||||
return re.sub(r"\{\{(.+?)\}\}", replacer, template)
|
||||
|
||||
|
||||
async def fetch_one(
|
||||
session: aiohttp.ClientSession,
|
||||
rule: dict,
|
||||
row: dict,
|
||||
row_index: int,
|
||||
semaphore: asyncio.Semaphore,
|
||||
) -> tuple[int, str, object]:
|
||||
"""对单行数据执行一条 enrichment rule 的 API 请求"""
|
||||
target_key = rule["target_key"]
|
||||
fallback = rule.get("fallback_value")
|
||||
|
||||
url = render_template(rule["url_template"], row)
|
||||
|
||||
headers = {}
|
||||
if rule.get("headers"):
|
||||
for k, v in rule["headers"].items():
|
||||
headers[k] = render_template(v, row)
|
||||
|
||||
method = rule.get("method", "GET").upper()
|
||||
|
||||
body = None
|
||||
if method == "POST" and rule.get("body_template"):
|
||||
body = render_template(rule["body_template"], row)
|
||||
|
||||
async with semaphore:
|
||||
try:
|
||||
kwargs: dict = {"headers": headers}
|
||||
if body is not None:
|
||||
try:
|
||||
kwargs["json"] = json.loads(body)
|
||||
except json.JSONDecodeError:
|
||||
kwargs["data"] = body
|
||||
|
||||
async with session.request(method, url, **kwargs) as resp:
|
||||
if resp.status >= 400:
|
||||
print(f" [WARN] Row {row_index} | {target_key} | HTTP {resp.status} <- {url}")
|
||||
return row_index, target_key, fallback
|
||||
|
||||
data = await resp.json(content_type=None)
|
||||
value = resolve_path(data, rule["response_path"])
|
||||
if value is None:
|
||||
value = fallback
|
||||
return row_index, target_key, value
|
||||
|
||||
except Exception as e:
|
||||
print(f" [ERROR] Row {row_index} | {target_key} | {type(e).__name__}: {e}")
|
||||
return row_index, target_key, fallback
|
||||
|
||||
|
||||
async def run_enrichments(
|
||||
source_data: list[dict],
|
||||
rules: list[dict],
|
||||
concurrency: int,
|
||||
) -> list[dict]:
|
||||
"""对所有行执行所有 enrichment rules"""
|
||||
if not rules:
|
||||
print("No enrichment rules configured. Outputting source data as-is.")
|
||||
return source_data
|
||||
|
||||
semaphore = asyncio.Semaphore(concurrency)
|
||||
tasks = []
|
||||
|
||||
total_calls = len(source_data) * len(rules)
|
||||
print(f"Enriching {len(source_data)} rows x {len(rules)} rule(s) = {total_calls} API calls")
|
||||
print(f"Concurrency: {concurrency}")
|
||||
print()
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
for row_idx, row in enumerate(source_data):
|
||||
for rule in rules:
|
||||
tasks.append(fetch_one(session, rule, row, row_idx, semaphore))
|
||||
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
error_count = 0
|
||||
for result in results:
|
||||
if isinstance(result, Exception):
|
||||
print(f" [ERROR] Unexpected: {result}")
|
||||
error_count += 1
|
||||
continue
|
||||
row_idx, target_key, value = result
|
||||
source_data[row_idx][target_key] = value
|
||||
|
||||
print()
|
||||
print(f"Done. {total_calls - error_count}/{total_calls} calls succeeded.")
|
||||
return source_data
|
||||
|
||||
|
||||
async def main():
|
||||
parser = argparse.ArgumentParser(description="Enrich job_bundle.json with API data and output final JSON.")
|
||||
parser.add_argument("bundle", help="Path to job_bundle.json")
|
||||
parser.add_argument("-o", "--output", type=str, default=None, help="Output file path (default: <bundle>_enriched.json)")
|
||||
parser.add_argument("--concurrency", type=int, default=5, help="Max concurrent API requests (default: 5)")
|
||||
args = parser.parse_args()
|
||||
|
||||
bundle_path = Path(args.bundle)
|
||||
if not bundle_path.exists():
|
||||
print(f"Error: File not found: {bundle_path}")
|
||||
sys.exit(1)
|
||||
|
||||
with open(bundle_path, "r", encoding="utf-8") as f:
|
||||
bundle = json.load(f)
|
||||
|
||||
meta = bundle.get("meta", {})
|
||||
config = bundle.get("config", {})
|
||||
source_data = bundle.get("source_data", [])
|
||||
|
||||
print(f"=== Job Bundle v{meta.get('version', '?')} ===")
|
||||
print(f"Generated: {meta.get('generated_at', '?')}")
|
||||
print(f"Rows: {len(source_data)}")
|
||||
print(f"Static rules: {len(config.get('static_rules', []))}")
|
||||
print(f"Enrichment rules: {len(config.get('enrichment_rules', []))}")
|
||||
print()
|
||||
|
||||
enriched = await run_enrichments(
|
||||
source_data,
|
||||
config.get("enrichment_rules", []),
|
||||
args.concurrency,
|
||||
)
|
||||
|
||||
# 构建输出,保留 submission 配置
|
||||
output = {
|
||||
"meta": meta,
|
||||
"submission": config.get("submission", {}),
|
||||
"data": enriched,
|
||||
}
|
||||
|
||||
output_path = Path(args.output) if args.output else bundle_path.with_name(bundle_path.stem + "_enriched.json")
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
json.dump(output, f, ensure_ascii=False, indent=2)
|
||||
|
||||
print(f"\nEnriched data saved to: {output_path}")
|
||||
print(f"Next step: python submitter.py {output_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
161
scripts/py/submitter.py
Normal file
161
scripts/py/submitter.py
Normal file
@@ -0,0 +1,161 @@
|
||||
"""
|
||||
submitter.py — 数据提交脚本
|
||||
|
||||
读取 enricher.py 输出的 JSON 文件,按批次提交到目标接口。
|
||||
记录提交成功和失败的记录到单独的日志文件。
|
||||
|
||||
用法:
|
||||
python submitter.py enriched_data.json
|
||||
python submitter.py enriched_data.json --batch-size 100
|
||||
python submitter.py enriched_data.json --url https://api.example.com/import # 覆盖 URL
|
||||
python submitter.py enriched_data.json --dry-run # 只输出不提交
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import aiohttp
|
||||
|
||||
|
||||
async def submit_batch(
|
||||
session: aiohttp.ClientSession,
|
||||
url: str,
|
||||
method: str,
|
||||
batch: list[dict],
|
||||
batch_index: int,
|
||||
total_batches: int,
|
||||
) -> tuple[bool, int, str, list[dict]]:
|
||||
"""提交一个批次,返回 (success, status_code, response_text, batch_data)"""
|
||||
try:
|
||||
async with session.request(method, url, json=batch) as resp:
|
||||
status = resp.status
|
||||
body = await resp.text()
|
||||
ok = status < 400
|
||||
tag = "OK" if ok else "FAIL"
|
||||
print(f" Batch {batch_index}/{total_batches}: HTTP {status} {tag} ({len(batch)} records)")
|
||||
return ok, status, body, batch
|
||||
except Exception as e:
|
||||
msg = f"{type(e).__name__}: {e}"
|
||||
print(f" Batch {batch_index}/{total_batches}: ERROR - {msg} ({len(batch)} records)")
|
||||
return False, 0, msg, batch
|
||||
|
||||
|
||||
async def main():
|
||||
parser = argparse.ArgumentParser(description="Submit enriched data to target API in batches.")
|
||||
parser.add_argument("input", help="Path to enriched JSON file (from enricher.py)")
|
||||
parser.add_argument("--url", type=str, default=None, help="Override target URL from config")
|
||||
parser.add_argument("--method", type=str, default=None, choices=["POST", "PUT"], help="Override HTTP method")
|
||||
parser.add_argument("--batch-size", type=int, default=None, help="Override batch size")
|
||||
parser.add_argument("--dry-run", action="store_true", help="Print what would be submitted without sending")
|
||||
args = parser.parse_args()
|
||||
|
||||
input_path = Path(args.input)
|
||||
if not input_path.exists():
|
||||
print(f"Error: File not found: {input_path}")
|
||||
sys.exit(1)
|
||||
|
||||
with open(input_path, "r", encoding="utf-8") as f:
|
||||
payload = json.load(f)
|
||||
|
||||
data = payload.get("data", [])
|
||||
submission = payload.get("submission", {})
|
||||
|
||||
target_url = args.url or submission.get("target_url", "")
|
||||
method = args.method or submission.get("method", "POST")
|
||||
batch_size = args.batch_size or submission.get("batch_size", 50)
|
||||
|
||||
if not target_url:
|
||||
print("Error: No target URL configured. Use --url or set in job bundle.")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"=== Submitter ===")
|
||||
print(f"Input: {input_path} ({len(data)} records)")
|
||||
print(f"Target: {method} {target_url}")
|
||||
print(f"Batch size: {batch_size}")
|
||||
print()
|
||||
|
||||
batches = [data[i : i + batch_size] for i in range(0, len(data), batch_size)]
|
||||
total_batches = len(batches)
|
||||
|
||||
if args.dry_run:
|
||||
print(f"[DRY RUN] Would submit {len(data)} records in {total_batches} batch(es)")
|
||||
for i, batch in enumerate(batches, 1):
|
||||
print(f" Batch {i}/{total_batches}: {len(batch)} records")
|
||||
print("\nDry run complete. No data was sent.")
|
||||
return
|
||||
|
||||
success_records: list[dict] = []
|
||||
failed_records: list[dict] = []
|
||||
failed_details: list[dict] = []
|
||||
|
||||
print(f"Submitting {len(data)} records in {total_batches} batch(es)...\n")
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
for i, batch in enumerate(batches, 1):
|
||||
ok, status, body, batch_data = await submit_batch(
|
||||
session, target_url, method, batch, i, total_batches
|
||||
)
|
||||
if ok:
|
||||
success_records.extend(batch_data)
|
||||
else:
|
||||
failed_records.extend(batch_data)
|
||||
failed_details.append({
|
||||
"batch_index": i,
|
||||
"status": status,
|
||||
"response": body[:500],
|
||||
"record_count": len(batch_data),
|
||||
})
|
||||
|
||||
# 结果统计
|
||||
print()
|
||||
print(f"=== Result ===")
|
||||
print(f"Success: {len(success_records)} records")
|
||||
print(f"Failed: {len(failed_records)} records")
|
||||
|
||||
# 写入日志文件
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
base_name = input_path.stem
|
||||
|
||||
if success_records:
|
||||
success_file = input_path.with_name(f"{base_name}_success_{timestamp}.json")
|
||||
with open(success_file, "w", encoding="utf-8") as f:
|
||||
json.dump(success_records, f, ensure_ascii=False, indent=2)
|
||||
print(f"\nSuccess log: {success_file}")
|
||||
|
||||
if failed_records:
|
||||
failed_file = input_path.with_name(f"{base_name}_failed_{timestamp}.json")
|
||||
report = {
|
||||
"summary": {
|
||||
"total_failed": len(failed_records),
|
||||
"failed_batches": len(failed_details),
|
||||
"target_url": target_url,
|
||||
"timestamp": timestamp,
|
||||
},
|
||||
"batch_errors": failed_details,
|
||||
"failed_records": failed_records,
|
||||
}
|
||||
with open(failed_file, "w", encoding="utf-8") as f:
|
||||
json.dump(report, f, ensure_ascii=False, indent=2)
|
||||
print(f"Failed log: {failed_file}")
|
||||
print(f"\nTo retry failed records:")
|
||||
print(f' python submitter.py {failed_file} --url "{target_url}"')
|
||||
# 生成一个可直接重试的文件
|
||||
retry_file = input_path.with_name(f"{base_name}_retry_{timestamp}.json")
|
||||
retry_payload = {
|
||||
"submission": submission,
|
||||
"data": failed_records,
|
||||
}
|
||||
with open(retry_file, "w", encoding="utf-8") as f:
|
||||
json.dump(retry_payload, f, ensure_ascii=False, indent=2)
|
||||
print(f" or: python submitter.py {retry_file}")
|
||||
|
||||
if not success_records and not failed_records:
|
||||
print("\nNo records to submit.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
Reference in New Issue
Block a user