From 96ce9e2dacc26bf2c7775b4ed7384c9ce4874e72 Mon Sep 17 00:00:00 2001
From: insleker <bkinnightskytw@gmail.com>
Date: Fri, 22 Aug 2025 11:38:57 +0800
Subject: [PATCH] feat: basic implement

---
 .gitignore                 |   8 +-
 README.md                  |  61 +++++++-
 docs/use_cases.md          |  24 +++
 pyproject.toml             |  21 +++
 src/gen_report/__init__.py | 289 +++++++++++++++++++++++++++++++++++++
 5 files changed, 400 insertions(+), 3 deletions(-)
 create mode 100644 docs/use_cases.md
 create mode 100644 pyproject.toml
 create mode 100644 src/gen_report/__init__.py

diff --git a/.gitignore b/.gitignore
index b7faf40..78048ea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -85,7 +85,7 @@ ipython_config.py
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
-# .python-version
+.python-version
 
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
@@ -98,7 +98,7 @@ ipython_config.py
 #   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
-#uv.lock
+uv.lock
 
 # poetry
 #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
@@ -205,3 +205,7 @@ cython_debug/
 marimo/_static/
 marimo/_lsp/
 __marimo__/
+
+**/*pb2*.py
+
+department_report.md
\ No newline at end of file
diff --git a/README.md b/README.md
index b4871a5..8b4692c 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,61 @@
 # gen-report
-CLI tool to consolidate weekly members reports into a single department weekly report using an LLM.
+
+CLI tool to consolidate individual weekly member reports (Word / Markdown) into a single department weekly report using an LLM (via `litellm`). It can optionally leverage historical example weeks (few-shot) to steer style and structure.
+
+## Features
+
+* Parse member report files named like: `{member_name}工作報告-YYYYMMDD.docx|doc|md`.
+* Convert `.docx` to Markdown using `mammoth`.
+* Optional examples folder with both historical member reports and an existing team report (file name contains `之工作報告`). These are packaged into few-shot examples.
+* Builds a structured prompt and calls a configured model (default `gemini/gemini-1.5-flash`, changeable) through `litellm`.
+* Outputs a consolidated Markdown report with standardized sections.
+
+## Environment
+
+Get your API key from the respective provider.
+
+Set an API key supported by `litellm` (Gemini preferred default):
+
+```powershell
+setx GOOGLE_API_KEY "your_gemini_key"  # Windows PowerShell (Gemini)
+```
+
+You can also use: `GEMINI_API_KEY`, `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `AZURE_OPENAI_API_KEY`, `GROQ_API_KEY`.
+
+## Usage
+
+```bash
+gen-report --source ./this_week -e ./history1 -e ./history2 --out dept_report_20240821.md \
+	--model gemini/gemini-1.5-flash
+```
+
+Dry run (print prompt only, no LLM call):
+
+```bash
+gen-report --source ./this_week -e ./history1 -e ./history2 --dry-run
+gen-report --source ./this_week -e ./history1 --dry-run
+```
+
+## Folder Layout Expectations
+
+```
+this_week/
+	Alice工作報告-20240821.docx
+	Bob工作報告-20240821.md
+history1/
+	Alice工作報告-20240814.docx
+	Bob工作報告-20240814.docx
+	114年08月7日~ 114年08月13日之工作報告_20240814.docx
+history2/
+	Alice工作報告-20240814.docx
+	Bob工作報告-20240814.docx
+	114年04月24日~ 114年04月30日之工作報告_20240814.docx
+```
+
+## Development
+
+Run locally without install:
+
+```bash
+python -m gen_report --source ./this_week --dry-run
+```
diff --git a/docs/use_cases.md b/docs/use_cases.md
new file mode 100644
index 0000000..0ffc119
--- /dev/null
+++ b/docs/use_cases.md
@@ -0,0 +1,24 @@
+# use cases
+
+* This program summarizes members' weekly reports into a single department weekly report.
+* User assign source folder which contains all the individual reports in `docx` or `markdown` for processing.
+  * program will parse those reports and generate a summary report in markdown format.
+* User should assigns additional example folder which contains all the individual reports at specific week, and relevant department report at that week.
+
+report document naming format
+
+member report naming, `{member_name}工作報告-{yyyyMMdd}.docx`, `{member_name}工作報告-{yyyyMMdd}.doc`, `{member_name}工作報告-{yyyyMMdd}.md`.
+
+team report naming, file name contain `{}~ {}之工作報告_{}`.
+
+## stage: preprocessing
+
+* program will read example folder and convert all docx file to markdown file copy if any.
+  * then program will give input members' reports and output department report (all in markdown) at that week into LLM as example context.
+
+## stage: summarized target report
+
+* program will convert all markdown files in src folder to plain markdown files.
+* program will input all plain markdown files into LLM as input, and LLM has to generate a final department report based on the input (just as example context).
+
+
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..ea4d837
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,21 @@
+[project]
+name = "gen-report"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+authors = [
+    { name = "insleker", email = "bkinnightskytw@gmail.com" }
+]
+requires-python = ">=3.10"
+dependencies = [
+    "litellm>=1.75.9",
+    "mammoth>=1.10.0",
+    "python-dotenv>=1.1.1",
+]
+
+[project.scripts]
+gen-report = "gen_report:main"
+
+[build-system]
+requires = ["uv_build>=0.8.12,<0.9.0"]
+build-backend = "uv_build"
diff --git a/src/gen_report/__init__.py b/src/gen_report/__init__.py
new file mode 100644
index 0000000..b3f98cb
--- /dev/null
+++ b/src/gen_report/__init__.py
@@ -0,0 +1,289 @@
+from __future__ import annotations
+
+import argparse
+import asyncio
+import os
+import traceback
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable, List, Optional, Any, cast, Union, Dict
+from dotenv import load_dotenv
+
+from litellm import acompletion
+
+import mammoth  # type: ignore
+
+
+MEMBER_REPORT_REGEX = re.compile(r"^(?P<name>.+?)工作報告-(?P<date>\d{8})\.(?P<ext>docx|doc|md)$")
+TEAM_REPORT_HINT = "之工作報告"
+
+
+@dataclass
+class ExampleWeek:
+    date: str
+    member_reports: List[str]
+    team_report: str
+
+
+def convert_docx_to_markdown(path: Path) -> str:
+    """Convert a .docx file to markdown text.
+
+    If mammoth is unavailable or conversion fails, returns an empty string.
+    """
+    if mammoth is None:
+        return ""
+    try:
+        with path.open("rb") as f:
+            result = mammoth.convert_to_markdown(f)
+        return result.value.strip()
+    except Exception:
+        return ""
+
+
+def load_markdown_from_file(path: Path) -> str:
+    ext = path.suffix.lower()
+    if ext == ".md":
+        try:
+            return path.read_text(encoding="utf-8").strip()
+        except Exception:
+            return ""
+    if ext in {".docx", ".doc"}:
+        return convert_docx_to_markdown(path)
+    return ""
+
+
+def iter_member_reports(folder: Path) -> Iterable[tuple[str, str, Path]]:
+    for p in folder.glob("*"):
+        if not p.is_file():
+            continue
+        m = MEMBER_REPORT_REGEX.match(p.name)
+        if not m:
+            continue
+        name = m.group("name")
+        date = m.group("date")
+        yield name, date, p
+
+
+def collect_examples(examples_dir: Optional[Path]) -> List[ExampleWeek]:
+    if not examples_dir or not examples_dir.exists():
+        return []
+    # Group by date extracted from member reports; then look for a team report file containing hint & date range.
+    by_date: dict[str, List[str]] = {}
+    for name, date, path in iter_member_reports(examples_dir):
+        text = load_markdown_from_file(path)
+        if not text:
+            continue
+        by_date.setdefault(date, []).append(f"# {name}\n\n{text}")
+
+    weeks: List[ExampleWeek] = []
+    # Find potential team reports
+    team_candidates = [
+        p for p in examples_dir.glob("*") if p.is_file() and TEAM_REPORT_HINT in p.name
+    ]
+    for date, member_chunks in by_date.items():
+        team_text = ""
+        for cand in team_candidates:
+            # Heuristic: if date substring appears in file name
+            if date in cand.name:
+                team_text = load_markdown_from_file(cand)
+                break
+        if not team_text and team_candidates:
+            # fallback first candidate
+            team_text = load_markdown_from_file(team_candidates[0])
+        if team_text:
+            weeks.append(
+                ExampleWeek(date=date, member_reports=member_chunks, team_report=team_text)
+            )
+    return weeks
+
+
+def collect_examples_from_dirs(example_dirs: List[Path]) -> List[ExampleWeek]:
+    """Collect and merge example weeks from multiple directories.
+
+    If the same date appears across directories, member reports are concatenated
+    and the first non-empty team report encountered is used.
+    """
+    if not example_dirs:
+        return []
+    merged: Dict[str, ExampleWeek] = {}
+    for d in example_dirs:
+        for w in collect_examples(d):
+            if w.date not in merged:
+                merged[w.date] = ExampleWeek(
+                    date=w.date,
+                    member_reports=list(w.member_reports),
+                    team_report=w.team_report,
+                )
+            else:
+                existing = merged[w.date]
+                # Append member reports, avoid exact duplicates
+                seen = set(existing.member_reports)
+                for mr in w.member_reports:
+                    if mr not in seen:
+                        existing.member_reports.append(mr)
+                        seen.add(mr)
+                if not existing.team_report and w.team_report:
+                    existing.team_report = w.team_report
+    # Return as list
+    return list(merged.values())
+
+
+def build_few_shot_examples(weeks: List[ExampleWeek]) -> str:
+    if not weeks:
+        return ""
+    # Use most recent dates (sorted descending)
+    weeks_sorted = sorted(weeks, key=lambda w: w.date, reverse=True)
+    blocks = []
+    for w in weeks_sorted:
+        blocks.append(
+            f"<EXAMPLE_WEEK date={w.date}>\n<INPUT>\n{chr(10).join(w.member_reports)}\n</INPUT>\n<OUTPUT>\n{w.team_report}\n</OUTPUT>\n</EXAMPLE_WEEK>"
+        )
+    return "\n\n".join(blocks)
+
+
+def build_prompt(member_markdowns: List[str], examples_block: str) -> str:
+    intro = (
+        "You are an assistant that aggregates individual weekly engineering reports into a concise, well-structured department report. "
+        "Summarize achievements, ongoing work, issues/risks, metrics, and next week plan. Keep factual, merge duplicates, and preserve important numbers.\n"
+    )
+    instructions = (
+        "Format sections based on projects' name."
+        "Plz try to keep projects consistent between examples and new report."
+		"'BONY觸控一體機','得鑫螺絲','得鑫螺絲HMI','EBONY觸控一體機'... 都屬於'螺絲案'的一部份。"
+		"'Resymot', 'Resymot GUI', 'iMotion-XYZ控制器'... 都屬於'iMotion-3dof'的一部份。"
+		"'育成計畫', '新人訓練'... 都屬於'教育訓練'的一部份。"
+        "Use bullet points; group similar items."
+    )
+    input_block = "\n\n".join(
+        f"<REPORT index={i}>\n{txt}\n</REPORT>" for i, txt in enumerate(member_markdowns, 1)
+    )
+    prompt = (
+        f"{intro}{instructions}\n"
+        + (f"\nFEW-SHOT EXAMPLES:\n{examples_block}\n" if examples_block else "")
+        + f"\nTARGET INPUT:\n{input_block}\n\nGenerate the consolidated department weekly report in markdown now."
+    )
+    return prompt
+
+
+async def call_llm(model: str, prompt: str, max_tokens: int = 2000) -> Union[str, None]:
+    try:
+        resp: Any = await acompletion(
+            model=model,
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=max_tokens,
+        )
+        # Support both object-style and dict-style responses
+        choices: Any = getattr(resp, "choices", None)
+        if choices is None and isinstance(resp, dict):
+            choices = resp.get("choices")
+        if not choices:
+            return None
+        first = choices[0]
+        message: Any = getattr(first, "message", None)
+        if message is None and isinstance(first, dict):
+            message = first.get("message")
+        if not message:
+            return None
+        content: Optional[str] = getattr(message, "content", None)
+        if content is None and isinstance(message, dict):
+            content = cast(Optional[str], message.get("content"))
+        return content
+    except Exception:
+        print(f"error occurred: {traceback.format_exc()}")
+        return None
+
+
+def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
+    p = argparse.ArgumentParser(
+        description="Generate consolidated weekly department report from individual reports"
+    )
+    p.add_argument(
+        "--source", required=True, help="Folder containing this week's member reports (docx/md)"
+    )
+    p.add_argument(
+        "--examples",
+        "-e",
+        action="append",
+        default=[],
+        help="Folder containing historical example weeks (optional). May be provided multiple times.",
+    )
+    p.add_argument(
+        "--model",
+        default="gemini/gemini-1.5-flash",
+        help="LLM model name for litellm (default: gemini/gemini-1.5-flash)",
+    )
+    p.add_argument("--out", default="department_report.md", help="Output markdown file path")
+    p.add_argument("--max-tokens", type=int, default=2000, help="Max tokens for generation")
+    p.add_argument(
+        "--dry-run", action="store_true", help="Only build and print prompt (no LLM call)"
+    )
+    return p.parse_args(argv)
+
+
+def gather_member_markdowns(source_dir: Path) -> List[str]:
+    chunks: List[str] = []
+    for name, date, path in iter_member_reports(source_dir):
+        text = load_markdown_from_file(path)
+        if not text:
+            continue
+        chunks.append(f"# {name}\n\n{text}")
+    return chunks
+
+
+def ensure_api_key_present() -> None:
+    # litellm supports many providers; we only check a few common env vars.
+    if any(
+        os.getenv(k)
+        for k in [
+            "GOOGLE_API_KEY",  # Gemini
+            "GEMINI_API_KEY",  # alternate naming if user sets
+            "OPENAI_API_KEY",
+            "ANTHROPIC_API_KEY",
+            "AZURE_OPENAI_API_KEY",
+            "GROQ_API_KEY",
+        ]
+    ):
+        return
+    raise RuntimeError(
+        "No provider API key env var found (e.g., GOOGLE_API_KEY for Gemini). Set one before running."
+    )
+
+
+def main(argv: Optional[List[str]] = None) -> None:
+    load_dotenv()
+    args = parse_args(argv)
+    source_dir = Path(args.source)
+    if not source_dir.exists():
+        raise SystemExit(f"Source folder not found: {source_dir}")
+    examples_dirs = [Path(p) for p in (args.examples or [])]
+
+    member_markdowns = gather_member_markdowns(source_dir)
+    if not member_markdowns:
+        raise SystemExit("No valid member reports found in source folder.")
+    examples = collect_examples_from_dirs(examples_dirs)
+    examples_block = build_few_shot_examples(examples)
+    prompt = build_prompt(member_markdowns, examples_block)
+
+    if args.dry_run:
+        print(prompt)
+        return
+
+    ensure_api_key_present()
+
+    async def _run():
+        report_md = await call_llm(args.model, prompt, max_tokens=args.max_tokens)
+        out_path = Path(args.out)
+        if report_md:
+            out_path.write_text(report_md, encoding="utf-8")
+        print(f"Report written to {out_path}")
+
+    asyncio.run(_run())
+
+
+__all__ = [
+    "main",
+    "build_prompt",
+    "collect_examples",
+    "gather_member_markdowns",
+]