From 96ce9e2dacc26bf2c7775b4ed7384c9ce4874e72 Mon Sep 17 00:00:00 2001 From: insleker Date: Fri, 22 Aug 2025 11:38:57 +0800 Subject: [PATCH] feat: basic implement --- .gitignore | 8 +- README.md | 61 +++++++- docs/use_cases.md | 24 +++ pyproject.toml | 21 +++ src/gen_report/__init__.py | 289 +++++++++++++++++++++++++++++++++++++ 5 files changed, 400 insertions(+), 3 deletions(-) create mode 100644 docs/use_cases.md create mode 100644 pyproject.toml create mode 100644 src/gen_report/__init__.py diff --git a/.gitignore b/.gitignore index b7faf40..78048ea 100644 --- a/.gitignore +++ b/.gitignore @@ -85,7 +85,7 @@ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: -# .python-version +.python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. @@ -98,7 +98,7 @@ ipython_config.py # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. -#uv.lock +uv.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. @@ -205,3 +205,7 @@ cython_debug/ marimo/_static/ marimo/_lsp/ __marimo__/ + +**/*pb2*.py + +department_report.md \ No newline at end of file diff --git a/README.md b/README.md index b4871a5..8b4692c 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,61 @@ # gen-report -CLI tool to consolidate weekly members reports into a single department weekly report using an LLM. + +CLI tool to consolidate individual weekly member reports (Word / Markdown) into a single department weekly report using an LLM (via `litellm`). It can optionally leverage historical example weeks (few-shot) to steer style and structure. 
+ +## Features + +* Parse member report files named like: `{member_name}工作報告-YYYYMMDD.docx|doc|md`. +* Convert `.docx` to Markdown using `mammoth`. +* Optional examples folder with both historical member reports and an existing team report (file name contains `之工作報告`). These are packaged into few-shot examples. +* Builds a structured prompt and calls a configured model (default `gemini/gemini-1.5-flash`, changeable) through `litellm`. +* Outputs a consolidated Markdown report with standardized sections. + +## Environment + +Get your API key from the respective provider. + +Set an API key supported by `litellm` (Gemini is the default provider): + +```powershell +setx GOOGLE_API_KEY "your_gemini_key" # Windows PowerShell (Gemini) +``` + +You can also use: `GEMINI_API_KEY`, `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `AZURE_OPENAI_API_KEY`, `GROQ_API_KEY`. + +## Usage + +```bash +gen-report --source ./this_week -e ./history1 -e ./history2 --out dept_report_20240821.md \ + --model gemini/gemini-1.5-flash +``` + +Dry run (print prompt only, no LLM call): + +```bash +gen-report --source ./this_week -e ./history1 -e ./history2 --dry-run +gen-report --source ./this_week -e ./history1 --dry-run +``` + +## Folder Layout Expectations + +``` +this_week/ + Alice工作報告-20240821.docx + Bob工作報告-20240821.md +history1/ + Alice工作報告-20240814.docx + Bob工作報告-20240814.docx + 114年08月7日~ 114年08月13日之工作報告_20240814.docx +history2/ + Alice工作報告-20240814.docx + Bob工作報告-20240814.docx + 114年04月24日~ 114年04月30日之工作報告_20240814.docx +``` + +## Development + +Run locally without install: + +```bash +python -m gen_report --source ./this_week --dry-run +``` diff --git a/docs/use_cases.md b/docs/use_cases.md new file mode 100644 index 0000000..0ffc119 --- /dev/null +++ b/docs/use_cases.md @@ -0,0 +1,24 @@ +# use cases + +* This program summarizes members' weekly reports into a single department weekly report. +* The user assigns a source folder that contains all the individual reports in `docx` or `markdown` for processing. 
+ * The program will parse those reports and generate a summary report in markdown format. +* The user should assign additional example folders, each containing all the individual reports for a specific week and the corresponding department report for that week. + +report document naming format + +member report naming: `{member_name}工作報告-{yyyyMMdd}.docx`, `{member_name}工作報告-{yyyyMMdd}.doc`, `{member_name}工作報告-{yyyyMMdd}.md`. + +team report naming: the file name contains `{}~ {}之工作報告_{}`. + +## stage: preprocessing + +* The program will read each example folder and convert every docx file, if any, to a markdown copy. + * Then the program will feed that week's input member reports and output department report (all in markdown) to the LLM as example context. + +## stage: summarized target report + +* The program will convert all report files in the source folder to plain markdown files. +* The program will feed all plain markdown files to the LLM as input, and the LLM has to generate a final department report based on that input (just as in the example context). 
+
+
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..ea4d837
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,21 @@
+[project]
+name = "gen-report"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+authors = [
+    { name = "insleker", email = "bkinnightskytw@gmail.com" }
+]
+requires-python = ">=3.10"
+dependencies = [
+    "litellm>=1.75.9",
+    "mammoth>=1.10.0",
+    "python-dotenv>=1.1.1",
+]
+
+[project.scripts]
+gen-report = "gen_report:main"
+
+[build-system]
+requires = ["uv_build>=0.8.12,<0.9.0"]
+build-backend = "uv_build"
diff --git a/src/gen_report/__init__.py b/src/gen_report/__init__.py
new file mode 100644
index 0000000..b3f98cb
--- /dev/null
+++ b/src/gen_report/__init__.py
@@ -0,0 +1,340 @@
+"""Consolidate weekly member reports into one department report using an LLM."""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import os
+import re
+import sys
+import traceback
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Union, cast
+
+import mammoth  # type: ignore
+from dotenv import load_dotenv
+from litellm import acompletion
+
+# Member report file names: "{member_name}工作報告-{yyyyMMdd}.{docx|doc|md}".
+MEMBER_REPORT_REGEX = re.compile(
+    r"^(?P<name>.+?)工作報告-(?P<date>\d{8})\.(?P<ext>docx|doc|md)$"
+)
+# Substring that marks a file name as a department (team) report.
+TEAM_REPORT_HINT = "之工作報告"
+
+
+@dataclass
+class ExampleWeek:
+    """One historical week used as a few-shot example."""
+
+    date: str  # yyyyMMdd string captured from the member report file names
+    member_reports: List[str]  # one markdown chunk per member report
+    team_report: str  # markdown text of the team report paired with them
+
+
+def convert_docx_to_markdown(path: Path) -> str:
+    """Convert a Word document to markdown text with mammoth.
+
+    Best-effort: if conversion fails for any reason, a warning is printed to
+    stderr and an empty string is returned so the caller can skip the file.
+    """
+    try:
+        with path.open("rb") as f:
+            result = mammoth.convert_to_markdown(f)
+        return result.value.strip()
+    except Exception as exc:  # deliberate catch-all at a best-effort boundary
+        print(f"warning: could not convert {path}: {exc}", file=sys.stderr)
+        return ""
+
+
+def load_markdown_from_file(path: Path) -> str:
+    """Return the stripped markdown content of *path*, or "" if unavailable.
+
+    ``.md`` files are read as UTF-8; ``.docx``/``.doc`` files go through
+    :func:`convert_docx_to_markdown`. Any other extension yields "".
+    """
+    ext = path.suffix.lower()
+    if ext == ".md":
+        try:
+            return path.read_text(encoding="utf-8").strip()
+        except (OSError, UnicodeDecodeError) as exc:
+            print(f"warning: could not read {path}: {exc}", file=sys.stderr)
+            return ""
+    if ext in {".docx", ".doc"}:
+        # NOTE(review): legacy .doc is handed to mammoth as well; presumably it
+        # only understands .docx, in which case such files are skipped with a
+        # warning - confirm.
+        return convert_docx_to_markdown(path)
+    return ""
+
+
+def iter_member_reports(folder: Path) -> Iterable[tuple[str, str, Path]]:
+    """Yield ``(member_name, date, path)`` for member report files in *folder*.
+
+    Only direct children whose names match ``MEMBER_REPORT_REGEX`` are
+    yielded, in sorted path order so the resulting prompt is reproducible.
+    """
+    for p in sorted(folder.glob("*")):
+        if not p.is_file():
+            continue
+        m = MEMBER_REPORT_REGEX.match(p.name)
+        if m:
+            yield m.group("name"), m.group("date"), p
+
+
+def collect_examples(examples_dir: Optional[Path]) -> List[ExampleWeek]:
+    """Build example weeks from one folder of historical reports.
+
+    Member reports are grouped by the date in their file names. Each date is
+    paired with the first team report whose file name contains that date,
+    falling back to the first team report in the folder. Dates for which no
+    team report text can be loaded are dropped.
+    """
+    if not examples_dir or not examples_dir.is_dir():
+        return []
+    by_date: Dict[str, List[str]] = {}
+    for name, date, path in iter_member_reports(examples_dir):
+        text = load_markdown_from_file(path)
+        if text:
+            by_date.setdefault(date, []).append(f"# {name}\n\n{text}")
+
+    team_candidates = sorted(
+        p for p in examples_dir.glob("*") if p.is_file() and TEAM_REPORT_HINT in p.name
+    )
+    weeks: List[ExampleWeek] = []
+    for date, member_chunks in by_date.items():
+        # Heuristic: a team report belongs to this week if its name carries the date.
+        dated = next((c for c in team_candidates if date in c.name), None)
+        team_text = load_markdown_from_file(dated) if dated else ""
+        if not team_text and team_candidates:
+            # Fallback: first team report found in the folder.
+            team_text = load_markdown_from_file(team_candidates[0])
+        if team_text:
+            weeks.append(
+                ExampleWeek(date=date, member_reports=member_chunks, team_report=team_text)
+            )
+    return weeks
+
+
+def collect_examples_from_dirs(example_dirs: List[Path]) -> List[ExampleWeek]:
+    """Collect and merge example weeks from multiple directories.
+
+    If the same date appears across directories, member reports are concatenated
+    (exact duplicates dropped) and the first non-empty team report encountered
+    is used.
+    """
+    merged: Dict[str, ExampleWeek] = {}
+    for d in example_dirs:
+        for w in collect_examples(d):
+            existing = merged.get(w.date)
+            if existing is None:
+                # Copy the list so later merging never mutates the source week.
+                merged[w.date] = ExampleWeek(
+                    date=w.date,
+                    member_reports=list(w.member_reports),
+                    team_report=w.team_report,
+                )
+                continue
+            seen = set(existing.member_reports)
+            for mr in w.member_reports:
+                if mr not in seen:
+                    existing.member_reports.append(mr)
+                    seen.add(mr)
+            if not existing.team_report and w.team_report:
+                existing.team_report = w.team_report
+    return list(merged.values())
+
+
+def build_few_shot_examples(weeks: List[ExampleWeek]) -> str:
+    """Render example weeks, most recent date first, as tagged few-shot blocks.
+
+    Explicit tags separate the member inputs from the expected team report so
+    the model can tell them apart.
+    """
+    blocks = []
+    for w in sorted(weeks, key=lambda w: w.date, reverse=True):
+        members = "\n".join(w.member_reports)
+        blocks.append(
+            f'<example date="{w.date}">\n'
+            f"<member_reports>\n{members}\n</member_reports>\n"
+            f"<team_report>\n{w.team_report}\n</team_report>\n"
+            "</example>"
+        )
+    return "\n\n".join(blocks)
+
+
+def build_prompt(member_markdowns: List[str], examples_block: str) -> str:
+    """Assemble the full LLM prompt.
+
+    The prompt is: role intro, formatting instructions, the optional few-shot
+    block, then the tagged member reports to consolidate.
+    """
+    intro = (
+        "You are an assistant that aggregates individual weekly engineering reports into a concise, well-structured department report. "
+        "Summarize achievements, ongoing work, issues/risks, metrics, and next week plan. Keep factual, merge duplicates, and preserve important numbers.\n"
+    )
+    # One instruction per line so the sentences do not run together.
+    instructions = "\n".join(
+        [
+            "Format sections based on projects' name.",
+            "Plz try to keep projects consistent between examples and new report.",
+            "'BONY觸控一體機','得鑫螺絲','得鑫螺絲HMI','EBONY觸控一體機'... 都屬於'螺絲案'的一部份。",
+            "'Resymot', 'Resymot GUI', 'iMotion-XYZ控制器'... 都屬於'iMotion-3dof'的一部份。",
+            "'育成計畫', '新人訓練'... 都屬於'教育訓練'的一部份。",
+            "Use bullet points; group similar items.",
+        ]
+    )
+    input_block = "\n\n".join(
+        f'<member_report index="{i}">\n{txt}\n</member_report>'
+        for i, txt in enumerate(member_markdowns, 1)
+    )
+    prompt = (
+        f"{intro}{instructions}\n"
+        + (f"\nFEW-SHOT EXAMPLES:\n{examples_block}\n" if examples_block else "")
+        + f"\nTARGET INPUT:\n{input_block}\n\nGenerate the consolidated department weekly report in markdown now."
+    )
+    return prompt
+
+
+def _get_field(obj: Any, key: str) -> Any:
+    """Read *key* from an object-style or dict-style response node."""
+    value = getattr(obj, key, None)
+    if value is None and isinstance(obj, dict):
+        value = obj.get(key)
+    return value
+
+
+async def call_llm(model: str, prompt: str, max_tokens: int = 2000) -> Union[str, None]:
+    """Send *prompt* to *model* through litellm and return the reply text.
+
+    Returns ``None`` when the call raises (the traceback is printed to stderr)
+    or when the response carries no choices or no message.
+    """
+    try:
+        resp: Any = await acompletion(
+            model=model,
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=max_tokens,
+        )
+    except Exception:
+        print(f"error occurred: {traceback.format_exc()}", file=sys.stderr)
+        return None
+    choices = _get_field(resp, "choices")
+    if not choices:
+        return None
+    message = _get_field(choices[0], "message")
+    if not message:
+        return None
+    return cast(Optional[str], _get_field(message, "content"))
+
+
+def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
+    """Parse command-line options (``sys.argv[1:]`` when *argv* is None)."""
+    p = argparse.ArgumentParser(
+        description="Generate consolidated weekly department report from individual reports"
+    )
+    p.add_argument(
+        "--source", required=True, help="Folder containing this week's member reports (docx/md)"
+    )
+    p.add_argument(
+        "--examples",
+        "-e",
+        action="append",
+        default=[],
+        help="Folder containing historical example weeks (optional). May be provided multiple times.",
+    )
+    p.add_argument(
+        "--model",
+        default="gemini/gemini-1.5-flash",
+        help="LLM model name for litellm (default: gemini/gemini-1.5-flash)",
+    )
+    p.add_argument("--out", default="department_report.md", help="Output markdown file path")
+    p.add_argument("--max-tokens", type=int, default=2000, help="Max tokens for generation")
+    p.add_argument(
+        "--dry-run", action="store_true", help="Only build and print prompt (no LLM call)"
+    )
+    return p.parse_args(argv)
+
+
+def gather_member_markdowns(source_dir: Path) -> List[str]:
+    """Return one ``# {name}`` markdown chunk per loadable member report."""
+    chunks: List[str] = []
+    for name, _date, path in iter_member_reports(source_dir):
+        text = load_markdown_from_file(path)
+        if text:
+            chunks.append(f"# {name}\n\n{text}")
+    return chunks
+
+
+def ensure_api_key_present() -> None:
+    """Raise ``RuntimeError`` unless a known provider API key env var is set.
+
+    litellm supports many providers; only a few common variable names are
+    checked here.
+    """
+    known_keys = (
+        "GOOGLE_API_KEY",  # Gemini
+        "GEMINI_API_KEY",  # alternate naming if user sets
+        "OPENAI_API_KEY",
+        "ANTHROPIC_API_KEY",
+        "AZURE_OPENAI_API_KEY",
+        "GROQ_API_KEY",
+    )
+    if any(os.getenv(k) for k in known_keys):
+        return
+    raise RuntimeError(
+        "No provider API key env var found (e.g., GOOGLE_API_KEY for Gemini). Set one before running."
+    )
+
+
+def main(argv: Optional[List[str]] = None) -> None:
+    """CLI entry point: build the prompt, call the LLM, write the report.
+
+    Exits with an error message (``SystemExit``) when the source folder is
+    missing, no member report can be loaded, no API key is configured, or the
+    LLM returns no content. With ``--dry-run`` only the prompt is printed.
+    """
+    load_dotenv()
+    args = parse_args(argv)
+    source_dir = Path(args.source)
+    if not source_dir.is_dir():
+        raise SystemExit(f"Source folder not found: {source_dir}")
+    examples_dirs = []
+    for raw in args.examples or []:
+        d = Path(raw)
+        if d.is_dir():
+            examples_dirs.append(d)
+        else:
+            print(f"warning: examples folder not found, skipping: {d}", file=sys.stderr)
+
+    member_markdowns = gather_member_markdowns(source_dir)
+    if not member_markdowns:
+        raise SystemExit("No valid member reports found in source folder.")
+    examples = collect_examples_from_dirs(examples_dirs)
+    examples_block = build_few_shot_examples(examples)
+    prompt = build_prompt(member_markdowns, examples_block)
+
+    if args.dry_run:
+        print(prompt)
+        return
+
+    try:
+        ensure_api_key_present()
+    except RuntimeError as exc:
+        raise SystemExit(str(exc)) from exc
+
+    report_md = asyncio.run(call_llm(args.model, prompt, max_tokens=args.max_tokens))
+    if not report_md:
+        raise SystemExit("LLM returned no content; no report was written.")
+    out_path = Path(args.out)
+    out_path.write_text(report_md, encoding="utf-8")
+    print(f"Report written to {out_path}")
+
+
+__all__ = [
+    "main",
+    "build_prompt",
+    "collect_examples",
+    "gather_member_markdowns",
+]