diff --git a/.gitignore b/.gitignore index d5f4aeb..0a19037 100644 --- a/.gitignore +++ b/.gitignore @@ -76,3 +76,7 @@ deepseek_venv/ dependency_setup/.venvs/ .venv_docling/ deepseek-ocr/DeepSeek-OCR-empty/ +.venv +# Local DeepSeek checkout and repro scripts (keep out of master) +deepseek-ocr/ +repro_rapidocr_onnx/ diff --git a/deepseek-ocr/NOTES.md b/deepseek-ocr/NOTES.md deleted file mode 100644 index e90d98e..0000000 --- a/deepseek-ocr/NOTES.md +++ /dev/null @@ -1,60 +0,0 @@ -# DeepSeek-OCR vLLM Runner Notes - -## Repo snapshot (2025-10-26 export) -- Runner scripts: `run_pdf_ocr_vllm.py` (high-throughput vLLM path) and `run_pdf_ocr.py` (single-process baseline). -- Model scaffold: `DeepSeek-OCR-empty/` mirrors the checkpoint layout; download `deepseek-ai/DeepSeek-OCR` and unpack it here so `model-00001-of-000001.safetensors` replaces the 135-byte placeholder. -- AWS ops files live under `aws/` (bootstrap script, deployment checklist, multi-GPU plan, sample PDF manifest). -- Notes (this file) carry the operational history, tuning tips, and install timings. Update here if you change dependencies or infra recipes. - -## Quickstart (fresh clone) -1. Fetch weights: `huggingface-cli download deepseek-ai/DeepSeek-OCR --local-dir DeepSeek-OCR --local-dir-use-symlinks False`. (Leave `DeepSeek-OCR-empty/` in place or remove it after you confirm the real weights are present.) -2. Create a Python 3.12 environment (`conda create -n deepseek-ocr python=3.12` or reuse the AWS bootstrap script). Activate it and run: - ```bash - pip install --pre --index-url https://download.pytorch.org/whl/nightly/cu124 torch torchvision torchaudio - pip install --pre --extra-index-url https://wheels.vllm.ai/nightly vllm - pip install PyMuPDF pillow numpy rich tqdm huggingface-hub safetensors "transformers>=4.55" - pip install 'huggingface_hub[hf_transfer]' - ``` -3. (Optional) Build `libjpeg-turbo` 3.0.3 + NASM 2.16 if you need Pillow-SIMD speedups; export `LD_LIBRARY_PATH=/path/to/libjpeg-turbo/lib:$LD_LIBRARY_PATH` before invoking the runners. -4. For multi-GPU AWS runs, copy `aws/setup_multi_gpu_env.sh` to the instance, run with sudo, then follow the checklist (`aws/deployment_checklist.md`). Sample PDFs and metadata are listed in `aws/sample_pdfs/greek_pdf_samples.json`. -5. Run a smoke test (replace `/path/to/pdfs` with a directory containing the sample PDFs from the manifest): - ```bash - python run_pdf_ocr_vllm.py --input-dir /path/to/pdfs --output-dir ./outputs --mode clean --max-pages 2 - ``` - -## Environment quirks -- Pillow-SIMD needs system JPEG headers. Build `libjpeg-turbo` 3.0.3 from source (with NASM 2.16.01) and export `LD_LIBRARY_PATH=/path/to/libjpeg-turbo/lib:$LD_LIBRARY_PATH` before running anything that touches Pillow. -- vLLM nightly currently requires the CUDA 12.9 stack (`torch/torchvision/torchaudio==2.9.0`). Leaving the checklist item for CUDA 11.8 wheels unchecked until an upstream build supports them. -- `flash-attn==2.7.3` installs cleanly once the CUDA 12.9 toolchain is active; keep the `--no-build-isolation` flag and reuse the custom `LD_LIBRARY_PATH`. -- Latest flash-attn build (SSM command `a3f3845c-1e2e-405c-a90a-1ff096a09672`) ran `pip install flash-attn==2.7.4.post1 --no-build-isolation` via `AWS-RunShellScript` on `deepseek-ocr-g6` and completed in roughly 43 minutes, so plan on ~45 minutes for future compiles. - -## Runner behaviour -- `run_pdf_ocr_vllm.py` now handles PDF rendering, blank-page detection, batching, FP8 KV cache defaults, and logs per-run throughput. 
Check `/tmp/run_vllm_full.log` (or custom log path) after runs for detailed timing. -- After clearing stale GPU processes and enabling blank-page short-circuiting, the latest smoke run (JSM_564 + PXJ_747, 5 pages) finished in 12.6 s (~0.79 pages/s; see `tmp_pdf_runs/test_run3.log`). Use this as the healthy baseline for short bursts; a full corpus pass on 2025-10-25 processed 724 pages in 1661 s (~0.44 pages/s, log `tmp_pdf_runs/run_bf16_full_20251025_152925.log`). -- Increase `--batch-pages` or enable `--enable-fp8-weights` to explore higher throughput once quality is validated; keep `--gpu-memory-utilization` near 0.95 for the L4. -- Combined Markdown files live directly under the output directory, named after the PDF stem (e.g. `AAB_001.md`). Page-level `.md` dumps are no longer written; optional assets (page PNGs or ROI crops) are stored under the per-PDF `_assets/` directory. -- Multi-GPU prep: `--num-shards N` and `--shard-index i` fan out PDF batches across workers/GPUs; combine with per-worker `CUDA_VISIBLE_DEVICES`, `--tensor-parallel-size` (if sharing a process across multiple devices), and optional `--mm-encoder-tp-mode data` to exercise vLLM’s multimodal encoder sharding. -- All dev smoke runs now wrap the CLI in `timeout 300 …` so we fail fast if decoding stalls. - -## Implemented enhancements -- Split prompts/modes: grounded runs use `<|grounding|>` and keep `<|ref|>/<|det|>` blocks, while clean runs strip meta tokens and emit Markdown-only text. -- Added tokenizer whitelist for `<td>`/`<tr>`/`<table>`-style table tags so the DeepSeek n-gram guard no longer suppresses short HTML tokens.
-- Upgraded `clean_output` to remove prompt echoes, `<image>` tokens, image captions, redundant bounding-box prefixes, spammy LaTeX/TikZ blocks, and tokenizer-declared special/meta tokens (tracked via a dynamic regex cache). -- Placeholder-safe tables: regex post-processing strips `None`/`N/A`/dash filler cells, collapses whitespace-only `<td>`s, and drops fully empty tables. (Logit bias guardrails are temporarily disabled as of 2025-10-26.) -- Combined outputs per PDF (`combined.md` / `combined_grounded.md`) with `----- Page N -----` headings starting from page 2 so Markdown previews render correctly. -- Canonical markdown pass trims multi-blank lines, de-hyphenates wrapped words, drops empty tables, and remaps superscript citations (`<sup>71</sup>`) to GitHub footnotes (`[^71]`). -- Fast blank-page detector (`is_mostly_blank_pix`) skips inference on empty renders and writes a `[[Blank page]]` stub, preventing hallucinated content on blank scans. -- Optional ROI second pass: grounded runs can crop table/title/paragraph/figure regions and re-infer them with the clean prompt for higher fidelity. -- Optional Large-mode retry: pages missing required labels (default `table`) re-run in the 1280 “Large” vision mode to recover objects missed by the standard tiler; the Large LLM is now reused across PDFs and capped via `--retry-large-cap` to avoid runaway retries. -- Per-page logging now records token counts/preview text and appends lightweight debug stats to `/tmp/debug.txt` during development. -- Decoding guardrails: sampler uses a shared stop list for `<|...|>` fragments and logs a post-run sanity summary (`placeholder_cells_pruned=…`, `tables_dropped=…`, etc.); placeholder logit bias is currently disabled pending a safer design. -- Supports BF16 baseline with FP8 KV cache by default, plus an `--enable-fp8-weights` switch for experimentation (quality needs manual review). -- Sharding helpers keep runner scripts multi-GPU ready without orchestration: each worker sets `CUDA_VISIBLE_DEVICES`, passes its shard index/count, and can increase `--tensor-parallel-size` when sharing an engine across local GPUs. - -## Follow-ups -- Replace the temporary `LD_LIBRARY_PATH` step with a wrapper script or env file to avoid manual exports. -- Revisit the CUDA 11.8 wheel requirement when a matching vLLM build becomes available; that will let us tick the outstanding checklist item. -- Validate the multi-GPU sharding flow on a 4× L4 block (one worker per GPU) and capture throughput deltas vs. single-device runs. -- Rework prompt-level placeholder instructions/logit bias; both were reverted on 2025-10-26 after causing “Here is the text…” hallucinations. Current runs use the minimal base prompt with no placeholder bias. -- Document 2025-10-26 AWS bootstrap adjustments: instance now uses IAM role `deepseek-ocr-ssm-role` + instance profile for SSM, CloudWatch, and S3 logging so long installs (e.g., flash-attn builds) can run via `aws ssm send-command`. -- FlashAttention build remains a pain point: no prebuilt wheel for Python 3.12 + CUDA 12.4, so `pip install flash-attn==2.7.4.post1 --no-build-isolation` compiles from source (~45–60 min). SSM command `a3f3845c-…` kicked off the build; monitor with `aws ssm get-command-invocation …` or CloudWatch `/deepseek-ocr/ssm` once the stream appears.
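For quick reference, the shard fan-out mentioned above is just a round-robin slice over the sorted PDF list; the sketch below mirrors the `all_pdf_files[shard_index::num_shards]` selection in `run_pdf_ocr_vllm.py`, while the preview loop is illustrative only and not a repo script.

```python
# Sketch: how --num-shards/--shard-index partition a PDF corpus across workers.
# Mirrors the round-robin slice the runner uses internally; the preview loop is illustrative.
from pathlib import Path


def shard_pdfs(input_dir: str, num_shards: int, shard_index: int) -> list[Path]:
    all_pdfs = sorted(p for p in Path(input_dir).glob("*.pdf") if p.is_file())
    return all_pdfs[shard_index::num_shards]  # same slice as run_pdf_ocr_vllm.py


if __name__ == "__main__":
    # Preview the per-GPU split for a 4-way run before opening tmux panes.
    for shard in range(4):
        files = shard_pdfs("/path/to/pdfs", num_shards=4, shard_index=shard)
        print(f"shard {shard} (CUDA_VISIBLE_DEVICES={shard}): {len(files)} PDFs")
```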
diff --git a/deepseek-ocr/README.md b/deepseek-ocr/README.md deleted file mode 100644 index ae09a9a..0000000 --- a/deepseek-ocr/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# DeepSeek-OCR Runner Snapshot - -This folder is a trimmed export of the DeepSeek-OCR automation work so it can be pushed to a remote repo without the 6 GB checkpoint or local build artifacts. - -- `run_pdf_ocr_vllm.py` – high-throughput vLLM runner with batching, blank-page skips, ROI retries, and multi-GPU sharding flags. -- `run_pdf_ocr.py` – minimal single-GPU baseline that uses the model’s native `.infer` API. -- `DeepSeek-OCR-empty/` – directory structure the runners expect; download the real weights from Hugging Face and overwrite the placeholder files. -- `aws/` – environment bootstrap script, deployment checklist, scaling plan, and the 10-sample PDF manifest used for regression runs. -- `NOTES.md` – operational logbook with build timings, throughput baselines, tuning guidance, and a quickstart guide for reproducing the setup. - -For fresh environments, follow the quickstart section in `NOTES.md`, then run the vLLM runner on your PDF corpus. The AWS scripts assume a g6.12xlarge (4× L4) instance but can be adapted to other CUDA 12.4+ stacks with minor tweaks. diff --git a/deepseek-ocr/aws/multi_gpu_plan.md b/deepseek-ocr/aws/multi_gpu_plan.md deleted file mode 100644 index 6dbfa85..0000000 --- a/deepseek-ocr/aws/multi_gpu_plan.md +++ /dev/null @@ -1,91 +0,0 @@ -# Multi-GPU DeepSeek-OCR Deployment Plan (AWS g6 4×L4) - -## Objectives -- Process large PDF corpora with DeepSeek-OCR using our vLLM runner on 4× NVIDIA L4 GPUs. -- Replicate the current bf16 + FP8 KV configuration used locally, with optional FP8 weight toggles. -- Provide a reproducible environment bootstrap script and operational checklist before launching the instance. - -## Target AWS Resources -- **Region:** `us-east-1` (availability of g6 family and proximity to S3/HF endpoints). -- **Instance type:** `g6.12xlarge` → 4× NVIDIA L4 (24 GB each), 48 vCPUs, 192 GB RAM. Alternative: `g6.24xlarge` if more CPU/RAM is needed while keeping 4× L4. -- **AMI:** *Deep Learning Base GPU AMI (Ubuntu 22.04) 2024.x* (includes NVIDIA drivers, CUDA 12.4 runtime, conda/mamba). Fallback: latest Ubuntu 22.04 + AWS-provided NVIDIA driver installer. -- **Storage:** 500 GB gp3 EBS (expandable to 1 TB if dataset expansion is expected). Throughput 500 MB/s, IOPS ≥ 6k. -- **Networking:** Associate an Elastic IP if remote access is required; optionally enable a placement group for multi-instance scale. - -## System Packages (via `apt`) -Install after `sudo apt-get update`: -- `git`, `curl`, `wget`, `jq`, `unzip`, `tmux`, `htop`, `nvtop`, `build-essential`, `pkg-config`, `ninja-build` -- Optional utilities for S3 sync / diagnostics: `s5cmd`, `iftop` (if available in repos). -- No additional image libraries or libjpeg builds needed; PyMuPDF and Pillow wheels ship their own binaries.
- -## Python & Environment Strategy -- Use the AMI’s preinstalled **conda/mamba** (`/opt/conda`) to create an isolated env: - ```bash - source /opt/conda/etc/profile.d/conda.sh - conda create -y -n deepseek-ocr python=3.12 - conda activate deepseek-ocr - ``` -- Install [uv](https://github.com/astral-sh/uv) for fast wheel resolution (optional but recommended): - ```bash - curl -LsSf https://astral.sh/uv/install.sh | sh - export PATH="$HOME/.local/bin:$PATH" - ``` -- Install **matching CUDA nightly builds** of PyTorch and vLLM (DeepSeek-OCR support landed Oct 23 2025 and requires nightly until v0.11.1): - ```bash - uv pip install --pre --index-url https://download.pytorch.org/whl/nightly/cu124 torch torchvision torchaudio - uv pip install --pre --extra-index-url https://wheels.vllm.ai/nightly vllm - ``` -- Install runner dependencies (no Pillow-SIMD or flash-attn pin required): - ```bash - uv pip install PyMuPDF pillow numpy rich tqdm huggingface-hub safetensors "transformers>=4.45.0" - uv pip install huggingface_hub[hf_transfer] # optional: accelerates HF downloads - ``` -- Export helpful env vars in `.bashrc` / profile: - ```bash - echo 'export HF_HUB_ENABLE_HF_TRANSFER=1' >> ~/.bashrc - echo 'export HF_HUB_DISABLE_TELEMETRY=1' >> ~/.bashrc - ``` - -## Model / Data Access -- **Hugging Face token** (`HF_TOKEN`) required to download `deepseek-ai/DeepSeek-OCR` if gated; export and run `huggingface-cli login`. -- Preload checkpoints to `/opt/models/DeepSeek-OCR` (or EBS/local NVMe path) to avoid repeated downloads. -- Upload or sync our repository (e.g., `git clone` from internal repo or `scp` / `aws s3 sync`). - -## Multi-GPU Execution Plan -- Use our runner with sharding: launch one process per GPU or a single process with tensor parallelism. - ```bash - CUDA_VISIBLE_DEVICES=0,1,2,3 python run_pdf_ocr_vllm.py \ - --input-dir /data/pdfs \ - --output-dir /data/outputs/deepseek_vllm_outputs_clean_combined \ - --mode clean \ - --tensor-parallel-size 4 \ - --mm-encoder-tp-mode data \ - --num-shards 4 --shard-index $SHARD_ID \ - --batch-pages 16 --gpu-memory-utilization 0.93 \ - --log-level INFO \ - --timeout-seconds 300 # apply via wrapper script during smoke tests - ``` -- For data parallelism, launch 4 separate tmux panes with `--num-shards 4 --shard-index {0..3}`. Use a shared NFS/EBS path for outputs. -- Keep `timeout 300` wrappers during smoke tests; remove for production runs once stable. - -## Monitoring & Housekeeping -- Monitor GPU usage with `watch -n 5 nvidia-smi` and `nvtop`. -- Enable CloudWatch logs/metrics if long jobs. -- Snapshot EBS volume post-run for reproducibility. -- If the instance exposes **local NVMe (instance store)**, mount it (e.g., `/local`) for intermediate PNG caches to reduce EBS churn. - -## Credentials & Secrets -- Store `HF_TOKEN`, `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and optional Git tokens in AWS Secrets Manager or SSM Parameter Store. Pass them via IAM instance profile or `aws ssm get-parameter` during setup. -- Avoid hardcoding tokens in scripts; use environment exports or `.env` files with restricted permissions. - -## Next Actions Before Launch -1. Prepare environment bootstrap script (`aws/setup_multi_gpu_env.sh`). -2. Stage sample PDFs / URLs (10 Greek-language files) for validation. -3. Confirm S3 bucket or EFS path for long-term storage if outputs exceed instance lifespan. -4. Decide on orchestration (manual tmux vs. SLURM/PM2). - -## Preflight Checklist on Instance -1. `nvidia-smi` → confirm 4× L4 GPUs and driver R550+. -2. 
`python - <<'PY'` sanity script to report Torch/vLLM versions and CUDA availability. -3. Run single-PDF smoke test with `timeout 300` and `--batch-pages 1` to confirm decoding emits text (no blank outputs). -4. Record baseline throughput (pages/s) before full run; adjust `--batch-pages` while keeping ~1–2 GB VRAM headroom. diff --git a/deepseek-ocr/aws/setup_multi_gpu_env.sh b/deepseek-ocr/aws/setup_multi_gpu_env.sh deleted file mode 100755 index 96f6725..0000000 --- a/deepseek-ocr/aws/setup_multi_gpu_env.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env bash -# Bootstrap script for the 4×L4 DeepSeek-OCR environment on an AWS g6 instance. -# Usage: bash setup_multi_gpu_env.sh - -set -euo pipefail - -# --- Configurable paths --- -CONDA_ENV_NAME="deepseek-ocr" -HF_CACHE_DIR="/opt/huggingface" -MODEL_CACHE="/opt/models" - -# --- Sanity checks --- -if [[ $EUID -ne 0 ]]; then - echo "[!] Please run this script with sudo (it needs apt + /opt writes)." >&2 - exit 1 -fi - -# Record start time -START_TS=$(date +%s) - -log() { - echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" -} - -log "Updating apt package index..." -apt-get update -y -apt-get upgrade -y - -log "Installing system prerequisites..." -DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - build-essential git curl wget jq unzip pkg-config ninja-build \ - libgl1 libglib2.0-0 libtiff-dev zlib1g-dev \ - htop nvtop tmux - -# Ensure conda exists (Deep Learning AMI ships it under /opt/conda) -if [[ ! -x /opt/conda/bin/conda ]]; then - log "Conda not found in /opt/conda; installing Miniconda..." - curl -fsSL https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -o /tmp/miniconda.sh - bash /tmp/miniconda.sh -b -p /opt/conda -fi - -source /opt/conda/etc/profile.d/conda.sh - -# Accept Anaconda TOS in advance (avoids interactive prompts) -/opt/conda/bin/conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main -/opt/conda/bin/conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r - -if conda info --envs | awk '{print $1}' | grep -qx "${CONDA_ENV_NAME}"; then - log "Conda env ${CONDA_ENV_NAME} already exists; skipping creation." -else - log "Creating conda env ${CONDA_ENV_NAME} with Python 3.12..." - conda create -y -n "${CONDA_ENV_NAME}" python=3.12 -fi - -log "Activating conda env ${CONDA_ENV_NAME}..." -conda activate "${CONDA_ENV_NAME}" - -log "Ensuring uv package manager is installed..." -if ! command -v uv >/dev/null 2>&1; then - export UV_INSTALL_DIR=/usr/local/bin - curl -LsSf https://astral.sh/uv/install.sh | sh -fi - -log "Installing PyTorch nightly (cu124) stack..." -uv pip install --python "$(which python)" --pre --index-url https://download.pytorch.org/whl/nightly/cu124 torch torchvision torchaudio - -log "Installing vLLM nightly (cu124)..." -uv pip install --python "$(which python)" --pre --extra-index-url https://wheels.vllm.ai/nightly vllm - -log "Installing Python dependencies (PyMuPDF, Pillow, numpy, transformers, etc.)..." -uv pip install --python "$(which python)" PyMuPDF pillow numpy rich tqdm huggingface-hub safetensors "transformers>=4.45.0" -uv pip install --python "$(which python)" 'huggingface_hub[hf_transfer]' - -log "Persisting environment exports..." -mkdir -p /etc/profile.d -cat >/etc/profile.d/deepseek-ocr.sh <\nYou are given a cropped region from a document. " - "Transcribe only the content of this region in markdown/HTML. " - "Leave empty table cells blank (
<td></td>) and keep Greek diacritics." -) -DEFAULT_RETRY_LABELS = ("table",) -BASE_VISION_CONFIG = (1024, 640, True) -LARGE_VISION_CONFIG = (1280, 1280, False) -CHECKPOINT_DIR = Path(__file__).resolve().parent / "DeepSeek-OCR" - -FULL_WIDTH_BAR = "\uFF5C" -FULL_WIDTH_LT = "\uFF1C" -ORPHAN_META_FRAGMENT_PATTERN = re.compile( - rf"<[|{FULL_WIDTH_BAR}][^>]{0,64}[|{FULL_WIDTH_BAR}]>", re.DOTALL -) -LEFTOVER_META_PATTERN = re.compile(rf"(?:<|{FULL_WIDTH_LT})(?:\||{FULL_WIDTH_BAR})") -REFDET_BLOCK_PATTERN = re.compile( - r"<\|ref\|>.*?<\|/ref\|><\|det\|>.*?<\|/det\|>", re.DOTALL | re.IGNORECASE -) -REFDET_EXTRACT_PATTERN = re.compile( - r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)", re.DOTALL | re.IGNORECASE -) -BOUNDING_BOX_PREFIX_PATTERN = re.compile(r"^\s*[A-Za-z_]+\[\[.*?\]\]\s*") -PLACEHOLDER_CELL_PATTERN = re.compile( - r"(<td[^>]*>)(.*?)(</td>)", re.IGNORECASE | re.DOTALL -) -EMPTY_CELL_PATTERN = re.compile(r"<td[^>]*>\s*</td>", re.IGNORECASE | re.DOTALL) -TABLE_BLOCK_PATTERN = re.compile( - r"<table[^>]*>.*?</table>
", re.IGNORECASE | re.DOTALL -) -NBSP_PATTERN = re.compile(r"(?: |\u00A0)+", re.IGNORECASE) -PLACEHOLDER_VALUES = { - "none", - "n/a", - "na", - "null", - "--", - "—", - "–", - "−", - "-", - "•", - "·", - "[[blank page]]", - "[blank]", - "(blank)", -} -PLACEHOLDER_LOGIT_BIAS_VALUE = -1.2 -PLACEHOLDER_BIAS_STRINGS = ( - " None", - " none", - "None", - "none", - " N/A", - " n/a", - "N/A", - "n/a", - " [[Blank page]]", - "[[Blank page]]", -) -DEHYPHEN_PATTERN = re.compile(r"(?<=\w)-\n(?=[a-zα-ωά-ώ])", re.IGNORECASE) -TIKZ_BLOCK_PATTERN = re.compile( - r"\\begin\{tikzpicture\}.*?\\end\{tikzpicture\}", re.DOTALL -) -LATEX_ARRAY_SPAM_PATTERN = re.compile( - r"(?:\[\s*\\begin\{array\}.*?\\end\{array\}\s*\]){3,}", re.DOTALL -) -LATEX_DRAW_SPAM_PATTERN = re.compile( - r"(?:\\draw\s*\([^;]*\);\s*){10,}", re.DOTALL -) -INLINE_LATEX_PATTERN = re.compile(r"\\\((.+?)\\\)") -BLOCK_LATEX_PATTERN = re.compile(r"\\\[\s*(.+?)\s*\\\]", re.DOTALL) -SIMPLE_SUP_PATTERN = re.compile(r"\$\^\{?([A-Za-z0-9+\-]+)\}?\$") -SIMPLE_SUB_PATTERN = re.compile(r"\$_\{?([A-Za-z0-9+\-]+)\}?\$") -CITATION_SUP_PATTERN = re.compile(r"(\d{2,}(?:[A-Za-z]{1,2})?)") - - -@dataclass -class PageJob: - pdf_path: Path - page_index: int - image: Image.Image - is_blank: bool = False - - -@dataclass -class ROIJob: - pdf_path: Path - page_index: int - region_index: int - label: str - bbox: Tuple[int, int, int, int] - image: Image.Image - - -def is_mostly_blank_pix( - pix: fitz.Pixmap, *, tolerance: int = 8, max_fraction: float = 0.0015 -) -> bool: - buf = pix.samples - if not buf: - return True - channels = 4 if pix.alpha else 3 - arr = np.frombuffer(buf, dtype=np.uint8) - if arr.size == 0: - return True - arr = arr.reshape(-1, channels) - if channels == 4: - arr = arr[:, :3] - if arr.size == 0: - return True - if arr.shape[0] > 65536: - samples = arr[::64] - else: - samples = arr - samples16 = samples.astype(np.int16, copy=False) - base = samples16[0] - diff = np.abs(samples16 - base) - if diff.max() <= tolerance: - return True - mask = np.any(diff > tolerance, axis=1) - return mask.mean() <= max_fraction - - -def batched(iterable: Iterable[PageJob], size: int) -> Iterator[List[PageJob]]: - batch: List[PageJob] = [] - for item in iterable: - batch.append(item) - if len(batch) == size: - yield batch - batch = [] - if batch: - yield batch - - -def render_page(pdf_path: Path, page_index: int, dpi: int) -> PageJob: - with fitz.open(pdf_path) as doc: - if page_index >= doc.page_count: - raise IndexError(f"Page {page_index} out of bounds for {pdf_path.name}") - page = doc[page_index] - scale = dpi / 72.0 - matrix = fitz.Matrix(scale, scale) - pix = page.get_pixmap(matrix=matrix, alpha=False) - blank = is_mostly_blank_pix(pix) - image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples).convert("RGB") - return PageJob( - pdf_path=pdf_path, - page_index=page_index, - image=image, - is_blank=blank, - ) - - -def render_pdf( - pdf_path: Path, - dpi: int, - executor: ThreadPoolExecutor, - max_pages: Optional[int] = None, -) -> List[PageJob]: - with fitz.open(pdf_path) as doc: - total_pages = doc.page_count - page_count = total_pages if max_pages is None else min(total_pages, max_pages) - futures = executor.map( - lambda idx: render_page(pdf_path, idx, dpi), range(page_count) - ) - jobs = list(futures) - jobs.sort(key=lambda job: job.page_index) - return jobs - - -def ensure_cuda_visible() -> None: - import torch - - if not torch.cuda.is_available(): - raise RuntimeError("CUDA device not detected; the L4 GPU is required for this script.") - - 
-def build_llm(args: argparse.Namespace) -> LLM: - llm_kwargs = dict( - model=args.model, - dtype=args.dtype, - enable_prefix_caching=False, - mm_processor_cache_gb=0, - gpu_memory_utilization=args.gpu_memory_utilization, - # Only pass OCR-specific logits processor when present in this vLLM build - logits_processors=( - [NGramPerReqLogitsProcessor] - if NGramPerReqLogitsProcessor is not None - else [] - ), - tensor_parallel_size=args.tensor_parallel_size, - ) - if not args.no_fp8_kv: - llm_kwargs["kv_cache_dtype"] = "fp8_e4m3" - if args.enable_fp8_weights: - llm_kwargs["quantization"] = "fp8" - if args.mm_encoder_tp_mode and args.mm_encoder_tp_mode != "auto": - llm_kwargs["mm_encoder_tp_mode"] = args.mm_encoder_tp_mode - return LLM(**llm_kwargs) - - -DEFAULT_STOP_SEQUENCES = ("<|", "<|", "<|", "|>") - - -def build_sampling_params( - args: argparse.Namespace, mode: str, whitelist_token_ids: set[int] -) -> SamplingParams: - extra_args = dict( - ngram_size=30, - window_size=90, - whitelist_token_ids=list(whitelist_token_ids), - ) - params_kwargs = dict( - temperature=0.0, - max_tokens=args.max_tokens, - skip_special_tokens=False, - stop=list(DEFAULT_STOP_SEQUENCES), - extra_args=extra_args, - ) - return SamplingParams(**params_kwargs) - - -def strip_prompt_echo(text: str, prompt: str) -> str: - lines = [ - line.strip() - for line in prompt.splitlines() - if line.strip() and line.strip() != "" - ] - for line in lines: - escaped = re.escape(line) - pattern = re.compile(rf"(?:{escaped})(?:\s+|$)") - while True: - new_text = pattern.sub("", text, count=1) - if new_text == text: - break - text = new_text.strip() - return text - - -def clean_output( - text: str, *, keep_refdet: bool, metrics: Optional[dict[str, int]] = None -) -> str: - text = text.replace("", "").replace("", "") - if not keep_refdet: - text = REFDET_BLOCK_PATTERN.sub("", text) - pattern = get_special_token_pattern(keep_refdet) - text = pattern.sub("", text) - text = ORPHAN_META_FRAGMENT_PATTERN.sub("", text) - text = LATEX_DRAW_SPAM_PATTERN.sub("", text) - text = TIKZ_BLOCK_PATTERN.sub( - "[[Figure omitted; refer to original page image]]", text - ) - text = LATEX_ARRAY_SPAM_PATTERN.sub( - "[[Matrix omitted; refer to original page image]]", text - ) - - lines: List[str] = [] - for raw_line in text.splitlines(): - line = BOUNDING_BOX_PREFIX_PATTERN.sub("", raw_line) - line = line.replace("
", "").replace("
", "") - stripped = line.strip() - if not stripped: - if lines and lines[-1] == "": - continue - lines.append("") - continue - lines.append(stripped) - - cleaned = "\n".join(lines).strip() - cleaned = INLINE_LATEX_PATTERN.sub(lambda m: f"${m.group(1)}$", cleaned) - cleaned = BLOCK_LATEX_PATTERN.sub(lambda m: f"$${m.group(1).strip()}$$", cleaned) - cleaned = SIMPLE_SUP_PATTERN.sub(r"\1", cleaned) - cleaned = SIMPLE_SUB_PATTERN.sub(r"\1", cleaned) - cleaned = prune_placeholder_cells(cleaned, metrics) - cleaned = drop_empty_tables(cleaned, metrics) - cleaned = re.sub(r"\n{3,}", "\n\n", cleaned) - return cleaned.strip() - - -@lru_cache(maxsize=1) -def get_tokenizer() -> AutoTokenizer: - if CHECKPOINT_DIR.exists(): - model_id = CHECKPOINT_DIR - else: - model_id = ( - "deepseek-ai/DeepSeek-OCR" - if _DEEPSEEK_OCR_AVAILABLE - else "deepseek-ai/DeepSeek-VL2" - ) - return AutoTokenizer.from_pretrained(str(model_id), trust_remote_code=True) - - -def _collect_special_strings(value, bucket: set[str]) -> None: - if isinstance(value, str): - if value: - bucket.add(value) - elif isinstance(value, dict): - for item in value.values(): - _collect_special_strings(item, bucket) - elif isinstance(value, (list, tuple, set)): - for item in value: - _collect_special_strings(item, bucket) - - -@lru_cache(maxsize=2) -def get_special_token_pattern(keep_refdet: bool) -> re.Pattern: - tokenizer = get_tokenizer() - specials: set[str] = set() - _collect_special_strings(tokenizer.all_special_tokens, specials) - _collect_special_strings(tokenizer.special_tokens_map_extended, specials) - specials.update({"<|User|>", "<|Assistant|>", "<|grounding|>", ""}) - if keep_refdet: - specials.difference_update({"<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>"}) - specials = {token for token in specials if token} - if not specials: - return re.compile(r"$^") - variants: set[str] = set() - for token in specials: - variants.add(re.escape(token)) - if "|" in token: - variants.add(re.escape(token.replace("|", FULL_WIDTH_BAR))) - pattern = "|".join(sorted(variants, key=len, reverse=True)) - return re.compile(f"(?:{pattern})") - - -@lru_cache(maxsize=1) -def compute_whitelist_token_ids(additional_tags: Tuple[str, ...] = ()) -> set[int]: - tokenizer = get_tokenizer() - tags = { - "", - "
", - "", - "", - "", - "", - "", - "", - } - tags.update(additional_tags) - token_ids: set[int] = set() - for tag in tags: - encoded = tokenizer.encode(tag, add_special_tokens=False) - token_ids.update(encoded) - return token_ids - - -@lru_cache(maxsize=1) -def get_placeholder_logit_bias() -> dict[int, float]: - tokenizer = get_tokenizer() - bias: dict[int, float] = {} - for candidate in PLACEHOLDER_BIAS_STRINGS: - ids = tokenizer.encode(candidate, add_special_tokens=False) - if len(ids) == 1: - bias[ids[0]] = PLACEHOLDER_LOGIT_BIAS_VALUE - return bias - - -def _normalize_cell_text(fragment: str) -> str: - text = html.unescape(fragment) - text = NBSP_PATTERN.sub(" ", text) - text = re.sub(r"", " ", text, flags=re.IGNORECASE) - text = re.sub(r"<[^>]+>", " ", text) - text = re.sub(r"\s+", " ", text).strip() - return text - - -def _is_placeholder_content(fragment: str) -> bool: - normalized = _normalize_cell_text(fragment) - if not normalized: - return True - lowered = normalized.lower() - compact = re.sub(r"[\s._\-\\/]+", "", lowered) - return lowered in PLACEHOLDER_VALUES or compact in PLACEHOLDER_VALUES - - -def prune_placeholder_cells( - html_text: str, metrics: Optional[dict[str, int]] = None -) -> str: - def replacer(match: re.Match[str]) -> str: - opening, body, closing = match.groups() - if _is_placeholder_content(body): - if metrics is not None: - metrics["placeholder_cells_pruned"] = ( - metrics.get("placeholder_cells_pruned", 0) + 1 - ) - return f"{opening}{closing}" - return f"{opening}{body}{closing}" - - return PLACEHOLDER_CELL_PATTERN.sub(replacer, html_text) - - -def drop_empty_tables( - html_text: str, metrics: Optional[dict[str, int]] = None -) -> str: - def replace_table(match: re.Match[str]) -> str: - table_html = match.group(0) - has_data = False - for cell_match in PLACEHOLDER_CELL_PATTERN.finditer(table_html): - cell_body = cell_match.group(2) - if not _is_placeholder_content(cell_body): - has_data = True - break - if has_data: - return table_html - # If there are elements with content, keep the table. 
- for header_match in re.finditer( - r"]*>(.*?)", table_html, re.IGNORECASE | re.DOTALL - ): - header_content = _normalize_cell_text(header_match.group(1)) - if header_content: - return table_html - if metrics is not None: - metrics["tables_dropped"] = metrics.get("tables_dropped", 0) + 1 - return "" - - return TABLE_BLOCK_PATTERN.sub(replace_table, html_text) - - -def canonicalize_markdown(text: str) -> str: - text = NBSP_PATTERN.sub(" ", text) - text = re.sub(r"[ \t]+\n", "\n", text) - text = DEHYPHEN_PATTERN.sub("", text) - text = prune_placeholder_cells(text) - text = drop_empty_tables(text) - text = CITATION_SUP_PATTERN.sub(lambda m: f"[^{m.group(1)}]", text) - text = re.sub(r"\n{3,}", "\n\n", text) - return text.strip() - - -def extract_refdet_regions(text: str) -> List[Tuple[str, List[Tuple[float, float, float, float]]]]: - regions: List[Tuple[str, List[Tuple[float, float, float, float]]]] = [] - for match in REFDET_EXTRACT_PATTERN.findall(text): - _, label_text, coords_text = match - label = label_text.strip().lower() - if not label: - continue - try: - coords = ast.literal_eval(coords_text) - except (ValueError, SyntaxError): - continue - boxes: List[Tuple[float, float, float, float]] = [] - if isinstance(coords, list): - for entry in coords: - if ( - isinstance(entry, (list, tuple)) - and len(entry) == 4 - and all(isinstance(v, (int, float)) for v in entry) - ): - boxes.append(tuple(float(v) for v in entry)) - if boxes: - regions.append((label, boxes)) - return regions - - -def convert_box_to_pixels( - box: Tuple[float, float, float, float], width: int, height: int -) -> Tuple[int, int, int, int]: - x1, y1, x2, y2 = box - x1 = max(0, min(width, int(round(x1 / 999.0 * width)))) - y1 = max(0, min(height, int(round(y1 / 999.0 * height)))) - x2 = max(0, min(width, int(round(x2 / 999.0 * width)))) - y2 = max(0, min(height, int(round(y2 / 999.0 * height)))) - if x2 <= x1: - x2 = min(width, x1 + 1) - if y2 <= y1: - y2 = min(height, y1 + 1) - return x1, y1, x2, y2 - - -def collect_roi_jobs( - job: PageJob, - regions: List[Tuple[str, List[Tuple[float, float, float, float]]]], - labels: Sequence[str], - min_area: int, - counters: dict[tuple[str, int], int], -) -> List[ROIJob]: - label_set = {label.lower() for label in labels} - width, height = job.image.size - roi_jobs: List[ROIJob] = [] - for label, boxes in regions: - if label not in label_set: - continue - key = (label, job.page_index) - index_base = counters.get(key, 0) - added = 0 - for box in boxes: - x1, y1, x2, y2 = convert_box_to_pixels(box, width, height) - area = (x2 - x1) * (y2 - y1) - if area < min_area: - continue - crop = job.image.crop((x1, y1, x2, y2)) - added += 1 - region_index = index_base + added - roi_jobs.append( - ROIJob( - pdf_path=job.pdf_path, - page_index=job.page_index, - region_index=region_index, - label=label, - bbox=(x1, y1, x2, y2), - image=crop, - ) - ) - counters[key] = index_base + added - return roi_jobs - - -@contextlib.contextmanager -def vision_mode_override(base_size: int, image_size: int, crop_mode: bool): - # DeepSeek OCR processors may not be present in some vLLM releases; fall back to VL2 - try: - module = importlib.import_module( - "vllm.transformers_utils.processors.deepseek_ocr" - ) - except Exception: # pragma: no cover - capability detection - module = importlib.import_module( - "vllm.transformers_utils.processors.deepseek_vl2" - ) - prev_base = getattr(module, "BASE_SIZE", None) - prev_image = getattr(module, "IMAGE_SIZE", None) - prev_crop = getattr(module, "CROP_MODE", None) - 
module.BASE_SIZE = base_size - module.IMAGE_SIZE = image_size - module.CROP_MODE = crop_mode - try: - yield - finally: - if prev_base is not None: - module.BASE_SIZE = prev_base - if prev_image is not None: - module.IMAGE_SIZE = prev_image - if prev_crop is not None: - module.CROP_MODE = prev_crop - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--input-dir", - type=Path, - required=True, - help="Directory containing PDF files to process.", - ) - parser.add_argument( - "--output-dir", - type=Path, - default=None, - help="Directory for combined markdown outputs. Default: /deepseek_vllm_outputs_.", - ) - parser.add_argument( - "--prompt", - type=str, - default=None, - help="Override prompt for the selected mode.", - ) - parser.add_argument( - "--grounded-prompt", - type=str, - default=DEFAULT_GROUNDED_PROMPT, - help="Prompt used in grounded mode (structure + boxes).", - ) - parser.add_argument( - "--clean-prompt", - type=str, - default=DEFAULT_CLEAN_PROMPT, - help="Prompt used in clean mode (structure removed).", - ) - parser.add_argument( - "--roi-prompt", - type=str, - default=DEFAULT_ROI_PROMPT, - help="Prompt for ROI second-pass crops.", - ) - parser.add_argument( - "--mode", - type=str, - default="clean", - choices=("clean", "grounded"), - help="Select output profile. Run twice (grounded & clean) to get both artifacts.", - ) - parser.add_argument( - "--batch-pages", - type=int, - default=12, - help="Number of pages to batch per vLLM generate() call.", - ) - parser.add_argument( - "--max-tokens", - type=int, - default=4096, - help="Maximum tokens to decode per page.", - ) - parser.add_argument( - "--dpi", - type=int, - default=180, - help="Rendering DPI for PDF pages.", - ) - parser.add_argument( - "--cpu-workers", - type=int, - default=8, - help="Thread pool size for PDF rendering.", - ) - # Default model: prefer local checkout if present; otherwise pick a remote id. 
- default_model = ( - str(CHECKPOINT_DIR) - if CHECKPOINT_DIR.exists() - else ("deepseek-ai/DeepSeek-OCR" if _DEEPSEEK_OCR_AVAILABLE else "deepseek-ai/DeepSeek-VL2") - ) - parser.add_argument( - "--model", - type=str, - default=default_model, - help="Model identifier or local path.", - ) - parser.add_argument( - "--dtype", - type=str, - default="bfloat16", - choices=("bfloat16", "float16", "float32"), - help="Precision for model weights/activations.", - ) - parser.add_argument( - "--no-fp8-kv", - action="store_true", - help="Disable FP8 KV cache.", - ) - parser.add_argument( - "--enable-fp8-weights", - action="store_true", - help="Enable FP8 weight quantization (verify quality).", - ) - parser.add_argument( - "--max-pages", - type=int, - default=None, - help="Optional page cap per PDF.", - ) - parser.add_argument( - "--gpu-memory-utilization", - type=float, - default=0.95, - help="Target fraction of GPU memory for vLLM.", - ) - parser.add_argument( - "--tensor-parallel-size", - type=int, - default=1, - help="Tensor parallel size to pass to vLLM when spanning multiple GPUs.", - ) - parser.add_argument( - "--mm-encoder-tp-mode", - type=str, - choices=("auto", "data", "sequence"), - default="auto", - help="Tensor-parallel strategy for the multimodal encoder (use 'data' for data-parallel fan-out).", - ) - parser.add_argument( - "--skip-existing", - action="store_true", - help="Skip pages whose outputs already exist.", - ) - parser.add_argument( - "--num-shards", - type=int, - default=1, - help="Split the PDF list into this many shards for multi-GPU runs.", - ) - parser.add_argument( - "--shard-index", - type=int, - default=0, - help="0-based index of the shard to process (requires --num-shards > 1).", - ) - parser.add_argument( - "--save-images", - action="store_true", - help="Persist rendered page PNGs alongside markdown outputs.", - ) - parser.add_argument( - "--roi-second-pass", - action="store_true", - help="After grounded run, crop specified regions and re-run clean inference per crop.", - ) - parser.add_argument( - "--roi-label", - action="append", - default=None, - help="Region label to include in ROI second pass (repeatable). 
Defaults to table/title/paragraph/figure.", - ) - parser.add_argument( - "--roi-min-area", - type=int, - default=2048, - help="Minimum pixel area for ROI crops in second pass.", - ) - parser.add_argument( - "--retry-large", - action="store_true", - help="Re-run pages with missing ref/det matches using Large (1280) vision mode.", - ) - parser.add_argument( - "--retry-label", - action="append", - default=None, - help="Region label that must appear; otherwise trigger Large retry (default: table).", - ) - parser.add_argument( - "--log-level", - type=str, - default="INFO", - help="Logging level.", - ) - parser.add_argument( - "--content-debug", - action="store_true", - help="Include page separators (---pages---) and truncation markers.", - ) - return parser.parse_args() - - -def resolve_prompt(args: argparse.Namespace) -> str: - if args.prompt: - return args.prompt - if args.mode == "grounded": - return args.grounded_prompt - return args.clean_prompt - - -def process_batch( - llm: LLM, - batch: Sequence[PageJob], - params: SamplingParams, - prompt: str, - vision_override: Optional[Tuple[int, int, bool]] = None, -) -> List[Tuple[PageJob, str, int, bool]]: - requests = [ - {"prompt": prompt, "multi_modal_data": {"image": job.image}} for job in batch - ] - manager = ( - vision_mode_override(*vision_override) - if vision_override is not None - else contextlib.nullcontext() - ) - with manager: - outputs = llm.generate(requests, params) - result_tuples: List[Tuple[PageJob, str, int, bool]] = [] - for job, output in zip(batch, outputs): - generated = output.outputs[0] - text = generated.text - token_ids = getattr(generated, "token_ids", ()) - token_count = len(token_ids) - finish_reason = getattr(generated, "finish_reason", None) - token_limit_hit = False - if finish_reason is not None: - reason_text = str(finish_reason).lower() - token_limit_hit = "length" in reason_text or "max_token" in reason_text - if not token_limit_hit: - max_tokens = getattr(params, "max_tokens", None) - if max_tokens is not None and token_count >= max_tokens: - token_limit_hit = True - logging.info( - "Decoded %s#%04d: %d tokens, %d chars (preview=%r)", - job.pdf_path.name, - job.page_index + 1, - token_count, - len(text), - text[:80], - ) - result_tuples.append((job, text, token_count, token_limit_hit)) - return result_tuples - - -def prepare_page_text( - job: PageJob, - text: str, - *, - keep_refdet: bool, - prompt: str, - token_limit_hit: bool = False, - token_limit: Optional[int] = None, - metrics: Optional[dict[str, int]] = None, - content_debug: bool = False, -) -> str: - text = strip_prompt_echo(text, prompt) - cleaned = clean_output(text, keep_refdet=keep_refdet, metrics=metrics) - logging.debug( - "Prepared page %s#%04d (%d chars)", - job.pdf_path.name, - job.page_index + 1, - len(cleaned), - ) - if LEFTOVER_META_PATTERN.search(cleaned): - if metrics is not None: - metrics["residual_meta_pages"] = ( - metrics.get("residual_meta_pages", 0) + 1 - ) - logging.warning( - "[%s] residual meta token markers detected on page %d", - job.pdf_path.name, - job.page_index + 1, - ) - try: - with open("/tmp/debug.txt", "a", encoding="utf-8") as dbg: - dbg.write(f"{job.pdf_path.name}:{job.page_index+1} len={len(cleaned)} raw_len={len(text)}\n") - except OSError: - pass - if token_limit_hit and content_debug: - warning = ( - f"[[Token limit reached at {token_limit} tokens; page may be truncated]]" - if token_limit - else "[[Token limit reached; page may be truncated]]" - ) - cleaned = f"{cleaned.rstrip()}\n\n{warning}" if cleaned 
else warning - if metrics is not None: - metrics["token_limit_hits"] = metrics.get("token_limit_hits", 0) + 1 - return cleaned - - -def get_blank_page_text(placeholder: str = "[[Blank page]]") -> str: - return placeholder.strip() - - -def stash_page_image(job: PageJob, assets_root: Optional[Path]) -> None: - if assets_root is None: - return - page_dir = assets_root / f"page_{job.page_index+1:04d}" - page_dir.mkdir(parents=True, exist_ok=True) - image_path = page_dir / "page.png" - if not image_path.exists(): - job.image.save(image_path, format="PNG", optimize=True) - - -def run_roi_second_pass( - llm: LLM, - roi_jobs: List[ROIJob], - params: SamplingParams, - prompt: str, - assets_root: Optional[Path], - metrics: Optional[dict[str, int]] = None, -) -> Tuple[int, int]: - if not roi_jobs or assets_root is None: - return 0, 0 - total_tokens = 0 - for batch in batched(roi_jobs, max(1, min(len(roi_jobs), 8))): - requests = [ - {"prompt": prompt, "multi_modal_data": {"image": roi.image}} - for roi in batch - ] - outputs = llm.generate(requests, params) - for roi, output in zip(batch, outputs): - text = output.outputs[0].text - total_tokens += len(output.outputs[0].token_ids) - cleaned = clean_output(text, keep_refdet=False, metrics=metrics) - page_dir = assets_root / f"page_{roi.page_index+1:04d}" - page_dir.mkdir(parents=True, exist_ok=True) - roi_path = page_dir / f"roi_{roi.label}_{roi.region_index:02d}.md" - header = f"\n" - roi_path.write_text(header + cleaned, encoding="utf-8") - return len(roi_jobs), total_tokens - - -def write_combined_markdown( - combined_path: Path, aggregated_pages: dict[int, str], *, content_debug: bool = False -) -> None: - if not aggregated_pages: - return - - sorted_pages = sorted(aggregated_pages) - sections: List[str] = [] - for offset, page_index in enumerate(sorted_pages): - page_text = aggregated_pages[page_index].strip() - if not page_text: - continue - if content_debug and offset > 0: - sections.append("---pages---") - sections.append(page_text) - - body = "\n\n".join(sections).strip() - if body: - body = canonicalize_markdown(body) - combined = f"{body}\n" if body else "" - combined_path.parent.mkdir(parents=True, exist_ok=True) - combined_path.write_text(combined, encoding="utf-8") - - -def main() -> None: - args = parse_args() - logging.basicConfig(level=getattr(logging, args.log_level.upper(), logging.INFO)) - ensure_cuda_visible() - - if args.tensor_parallel_size < 1: - raise ValueError("--tensor-parallel-size must be >= 1.") - if args.num_shards < 1: - raise ValueError("--num-shards must be >= 1.") - if not 0 <= args.shard_index < args.num_shards: - raise ValueError("--shard-index must satisfy 0 <= shard < num_shards.") - - prompt = resolve_prompt(args) - roi_labels = tuple( - label.lower() - for label in ( - args.roi_label - if args.roi_label is not None - else ("table", "title", "paragraph", "figure", "equation", "list") - ) - ) - retry_labels = tuple( - label.lower() - for label in (args.retry_label if args.retry_label else DEFAULT_RETRY_LABELS) - ) - - input_dir = args.input_dir.resolve() - if not input_dir.is_dir(): - raise FileNotFoundError(f"Input directory does not exist: {input_dir}") - - default_dir = ( - "deepseek_vllm_outputs_grounded" - if args.mode == "grounded" - else "deepseek_vllm_outputs_clean" - ) - output_root = ( - args.output_dir.resolve() - if args.output_dir is not None - else input_dir / default_dir - ) - output_root.mkdir(parents=True, exist_ok=True) - - all_pdf_files = sorted(f for f in input_dir.glob("*.pdf") if 
f.is_file()) - if not all_pdf_files: - raise FileNotFoundError(f"No PDF files found in {input_dir}") - if args.num_shards > 1: - pdf_files = all_pdf_files[args.shard_index :: args.num_shards] - logging.info( - "Shard %d/%d assigned %d of %d PDF(s).", - args.shard_index, - args.num_shards, - len(pdf_files), - len(all_pdf_files), - ) - if not pdf_files: - logging.warning( - "Shard %d/%d has no PDFs to process; exiting.", - args.shard_index, - args.num_shards, - ) - return - else: - pdf_files = all_pdf_files - - whitelist_ids = compute_whitelist_token_ids() - llm = None - try: - llm = build_llm(args) - sampling_params = build_sampling_params(args, args.mode, whitelist_ids) - clean_params = ( - build_sampling_params(args, "clean", whitelist_ids) - if args.roi_second_pass or args.mode == "clean" - else None - ) - - total_pages = 0 - total_tokens = 0 - roi_total_tokens = 0 - run_start = time.perf_counter() - metrics: dict[str, int] = {} - - with ThreadPoolExecutor(max_workers=args.cpu_workers) as executor: - for pdf_path in pdf_files: - combined_name = f"{pdf_path.stem}.md" - combined_path = output_root / combined_name - if args.skip_existing and combined_path.exists(): - logging.info( - "[%s] combined output already exists; skipping.", - pdf_path.name, - ) - continue - - page_jobs = render_pdf(pdf_path, args.dpi, executor, args.max_pages) - if not page_jobs: - continue - - assets_root: Optional[Path] = None - if args.save_images or args.roi_second_pass: - assets_root = output_root / f"{pdf_path.stem}_assets" - assets_root.mkdir(parents=True, exist_ok=True) - - aggregated_pages: dict[int, str] = {} - pdf_start = time.perf_counter() - pages_written = 0 - roi_jobs: List[ROIJob] = [] - roi_counters: dict[tuple[str, int], int] = {} - pages_to_retry: List[PageJob] = [] - - page_token_limit = getattr(sampling_params, "max_tokens", None) - for batch in batched(page_jobs, args.batch_pages): - working_batch = [] - for job in batch: - if job.is_blank: - blank_text = get_blank_page_text() - aggregated_pages[job.page_index] = ( - blank_text if args.content_debug else "" - ) - pages_written += 1 - if args.save_images: - stash_page_image(job, assets_root) - logging.debug( - "[%s] detected blank page %d; skipping inference", - pdf_path.name, - job.page_index + 1, - ) - continue - working_batch.append(job) - if not working_batch: - continue - results = process_batch( - llm, working_batch, sampling_params, prompt, BASE_VISION_CONFIG - ) - for job, raw_text, token_count, token_limit_hit in results: - keep_refdet = args.mode == "grounded" - cleaned = prepare_page_text( - job, - raw_text, - keep_refdet=keep_refdet, - prompt=prompt, - token_limit_hit=token_limit_hit, - token_limit=page_token_limit, - metrics=metrics, - content_debug=bool(args.content_debug), - ) - aggregated_pages[job.page_index] = cleaned - if args.save_images: - stash_page_image(job, assets_root) - - total_tokens += token_count - pages_written += 1 - - if keep_refdet: - regions = extract_refdet_regions(raw_text) - label_counts = {label: len(boxes) for label, boxes in regions} - if args.retry_large and any( - label_counts.get(label, 0) == 0 for label in retry_labels - ): - pages_to_retry.append(job) - if args.roi_second_pass: - roi_jobs.extend( - collect_roi_jobs( - job, - regions, - roi_labels, - args.roi_min_area, - roi_counters, - ) - ) - - if args.retry_large and pages_to_retry: - logging.info( - "[%s] retrying %d page(s) in Large vision mode", - pdf_path.name, - len(pages_to_retry), - ) - if args.roi_second_pass: - retry_indices = 
{job.page_index for job in pages_to_retry} - roi_jobs = [ - roi for roi in roi_jobs if roi.page_index not in retry_indices - ] - for key in list(roi_counters.keys()): - if key[1] in retry_indices: - roi_counters.pop(key, None) - for batch in batched(pages_to_retry, args.batch_pages): - results_large = process_batch( - llm, batch, sampling_params, prompt, LARGE_VISION_CONFIG - ) - for job, raw_text, token_count, token_limit_hit in results_large: - keep_refdet = args.mode == "grounded" - cleaned = prepare_page_text( - job, - raw_text, - keep_refdet=keep_refdet, - prompt=prompt, - token_limit_hit=token_limit_hit, - token_limit=page_token_limit, - metrics=metrics, - content_debug=bool(args.content_debug), - ) - aggregated_pages[job.page_index] = cleaned - total_tokens += token_count - if args.save_images: - stash_page_image(job, assets_root) - if keep_refdet and args.roi_second_pass: - regions = extract_refdet_regions(raw_text) - roi_jobs.extend( - collect_roi_jobs( - job, - regions, - roi_labels, - args.roi_min_area, - roi_counters, - ) - ) - - if args.roi_second_pass: - if args.mode != "grounded": - logging.warning( - "[%s] ROI second pass requires grounded mode; skipping.", - pdf_path.name, - ) - elif not roi_jobs: - logging.info("[%s] No ROI targets discovered.", pdf_path.name) - elif clean_params is None: - logging.warning( - "[%s] Clean sampling params unavailable; skipping ROI.", - pdf_path.name, - ) - else: - roi_count, roi_tokens = run_roi_second_pass( - llm, - roi_jobs, - clean_params, - args.roi_prompt, - assets_root, - metrics, - ) - roi_total_tokens += roi_tokens - logging.info( - "[%s] ROI second pass completed for %d region(s).", - pdf_path.name, - roi_count, - ) - - write_combined_markdown( - combined_path, aggregated_pages, content_debug=bool(args.content_debug) - ) - - total_pages += pages_written - pdf_elapsed = time.perf_counter() - pdf_start - if pages_written: - logging.info( - "[%s] processed %d page(s) in %.1fs (%.2f pp/s)", - pdf_path.name, - pages_written, - pdf_elapsed, - pages_written / max(pdf_elapsed, 1e-6), - ) - - total_elapsed = time.perf_counter() - run_start - pages_per_sec = total_pages / max(total_elapsed, 1e-6) - tokens_per_sec = total_tokens / max(total_elapsed, 1e-6) - logging.info( - "Completed %d page(s) in %.1fs (%.2f pages/s, %d tokens/s, %d ROI tokens)", - total_pages, - total_elapsed, - pages_per_sec, - int(tokens_per_sec), - roi_total_tokens, - ) - metric_parts: List[str] = [] - if metrics: - metric_parts = [f"{key}={value}" for key, value in metrics.items() if value] - if metric_parts: - logging.info("Sanity metrics: %s", ", ".join(sorted(metric_parts))) - finally: - # Encourage vLLM to shut down GPU/engine threads so process exits cleanly - with contextlib.suppress(Exception): - engine = getattr(llm, "llm_engine", None) if llm is not None else None - if engine is not None: - shutdown = getattr(engine, "shutdown", None) - if callable(shutdown): - shutdown() - # Best-effort cleanup - try: - import torch # type: ignore - - if torch.cuda.is_available(): - torch.cuda.empty_cache() - except Exception: - pass - import gc as _gc - - del llm - _gc.collect() - - -if __name__ == "__main__": - main() diff --git a/pyproject.toml b/pyproject.toml index 6e6672c..4e7ea39 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "glossapi" -version = "0.1.2" +version = "0.1.3" description = "Academic document processing pipeline with Rust-powered markdown cleaning" authors = [ {name = "GlossAPI Team", email = 
"glossapi.team@eellak.gr"} diff --git a/repro_rapidocr_onnx/DECISIONS.md b/repro_rapidocr_onnx/DECISIONS.md deleted file mode 100644 index fe66c6a..0000000 --- a/repro_rapidocr_onnx/DECISIONS.md +++ /dev/null @@ -1,34 +0,0 @@ -Decisions & Rationale - -This documents the key choices we made to reach a stable, high-quality Docling + RapidOCR (ONNX) setup for Greek. - -1) Use RapidOCR ONNXRuntime backend -- Why: Docling supports RapidOCR natively and we have PP-OCRv5 Greek rec ONNX. ONNXRuntime-GPU is widely available. -- Avoid: Installing generic `rapidocr` only; use `rapidocr_onnxruntime` to ensure the engine is present and loadable. - -2) Keep layout on CPU in this repro -- Why: Avoids potential NCCL/Torch CUDA issues in varied environments. Docling layout still works, and OCR gains are large. -- If enabling GPU layout: install a matching Torch CUDA build and, if you see NCCL warnings, set `NCCL_P2P_DISABLE=1` and `NCCL_IB_DISABLE=1`. - -3) Require explicit ONNX det/rec paths and auto-locate CLS -- Why: det/rec ONNX are user-provided or locally converted; CLS shape compatibility is tricky, so we auto-locate RapidOCR’s known-good CLS. -- Avoid: Using arbitrary CLS ONNX with mismatched input dims (causes INVALID_ARGUMENT errors in ORT). - -4) Generate Greek keys from Paddle inference.yml -- Why: Recognition requires a character dictionary that matches the rec model labels. Extracting from `inference.yml` guarantees alignment. -- Avoid: Letting RapidOCR infer a dict URL; this fails for the Greek rec model and surfaces as a misleading factory error. - -5) Patch Docling to pass `Rec.rec_keys_path` -- Why: Docling 2.48.0 uses `Rec.keys_path`; RapidOCR expects `Rec.rec_keys_path`. The one-line patch ensures keys are honored. -- How: `scripts/repatch_docling.sh` finds the installed file and patches in place; reapply after upgrades. - -6) Avoid CPU ORT alongside ORT GPU -- Why: Having both `onnxruntime` and `onnxruntime-gpu` in the same venv can confuse provider detection and behavior. -- Fix: Uninstall `onnxruntime` CPU if present and reinstall `onnxruntime-gpu`. - -7) Keep numpy<2 -- Why: Best compatibility with ORT wheels and transitive deps we used. - -8) Caches under user control -- Why: Environments vary; we don’t assume specific paths. The run scripts accept standard env vars if you want non-default cache locations. - diff --git a/repro_rapidocr_onnx/ENVIRONMENT.md b/repro_rapidocr_onnx/ENVIRONMENT.md deleted file mode 100644 index d43def1..0000000 --- a/repro_rapidocr_onnx/ENVIRONMENT.md +++ /dev/null @@ -1,72 +0,0 @@ -Environment Setup (Docling + RapidOCR ONNX) - -Goal: Docling layout + RapidOCR (ONNXRuntime GPU) with Greek PP‑OCRv5 rec. - -Create venv (no system path assumptions) - -``` -python3 -m venv .venv_docling -source .venv_docling/bin/activate -python -m pip install -U pip -python -m pip install -r repro_rapidocr_onnx/requirements.txt -``` - -Critical avoids - -- Don’t install `onnxruntime` CPU alongside `onnxruntime-gpu`. If present, uninstall it: - - `pip uninstall -y onnxruntime` -- Use `rapidocr_onnxruntime` (the ONNX flavor). The meta `rapidocr` alone is not sufficient for Docling integration. -- Keep `numpy<2` in this venv (ORT ABI compatibility). -- Keep layout on CPU unless you add a matching Torch CUDA; if enabling GPU layout later, consider `NCCL_P2P_DISABLE=1`, `NCCL_IB_DISABLE=1`. -- Avoid local package shadowing (e.g., do not add this repo root to `PYTHONPATH` when testing unrelated packages). 
- -Providers and caches - -- Verify ORT GPU providers: - - `python repro_rapidocr_onnx/scripts/check_ort.py` → should include `CUDAExecutionProvider`. -- Optional: set caches away from `$HOME` (customize to your system): - - `TMPDIR=/path/to/tmp` `XDG_CACHE_HOME=//path/to/cache` `HF_HOME=/path/to/hf` - -Post-install patch (Docling → RapidOCR) — auto-detects site-packages - -Docling 2.48.0 passes `Rec.keys_path` to RapidOCR; RapidOCR expects `Rec.rec_keys_path`. -Reapply after reinstall: - -``` -bash repro_rapidocr_onnx/scripts/repatch_docling.sh -``` - -Optional: enable GPU layout - -If you want Docling’s layout to run on GPU as well (OCR already uses ORT GPU): - -``` -source .venv_docling/bin/activate -pip install --index-url https://download.pytorch.org/whl/cu121 \ - torch==2.5.1 torchvision==0.20.1 -python -c "import torch; print(torch.cuda.is_available(), torch.cuda.get_device_name(0))" -``` - -Optional stability vars if you see NCCL warnings: - -``` -export NCCL_P2P_DISABLE=1 -export NCCL_IB_DISABLE=1 - -Optional: enable Docling formula/code enrichment (LaTeX + code blocks) - -- Enrichment is off by default. To enable with best performance on GPU: - -``` -source .venv_docling/bin/activate -pip install --index-url https://download.pytorch.org/whl/cu121 \ - torch torchvision -``` - -- Then add flags when running: - -``` -bash repro_rapidocr_onnx/scripts/run_onnx.sh ... \ - --docling-formula --formula-batch 8 [--docling-code] -``` -``` diff --git a/repro_rapidocr_onnx/KEYS.md b/repro_rapidocr_onnx/KEYS.md deleted file mode 100644 index a45272e..0000000 --- a/repro_rapidocr_onnx/KEYS.md +++ /dev/null @@ -1,25 +0,0 @@ -Keys (recognizer labels) - -RapidOCR’s ONNX recognizer needs the character dictionary that matches the rec model. For Greek PP‑OCRv5, generate the keys file from the Paddle inference config. - -Steps - -1) Ensure you have Greek Paddle inference folder unpacked: - - `/mnt/data/models/paddlev5/el_PP-OCRv5_mobile_rec_infer/inference.yml` - -2) Generate keys: - -``` -source .venv_docling/bin/activate -python repro_rapidocr_onnx/scripts/extract_keys.py \ - --in-yml /mnt/data/models/paddlev5/el_PP-OCRv5_mobile_rec_infer/inference.yml \ - --out /mnt/data/models/paddlev5/greek_ppocrv5_keys.txt -``` - -3) Use the keys file via `--rec-keys` in the runner. - -Why this matters - -- Without `rec_keys_path`, RapidOCR tries to resolve a dict URL for generic models and fails with a masked factory error. -- The extracted keys ensure Greek letters and diacritics align with your rec ONNX labels. - diff --git a/repro_rapidocr_onnx/MODELS.md b/repro_rapidocr_onnx/MODELS.md deleted file mode 100644 index 90f7029..0000000 --- a/repro_rapidocr_onnx/MODELS.md +++ /dev/null @@ -1,46 +0,0 @@ -Models (ONNX) - -You need three ONNX models: - -- Detection (det): PP‑OCRv5 det ONNX -- Classification (cls): text orientation classifier ONNX -- Recognition (rec): PP‑OCRv5 Greek rec ONNX - -Paths used in our working setup - -- Det: `/mnt/data/models/paddlev5/det_onnx/inference.onnx` -- Rec: `/mnt/data/models/paddlev5/rec_onnx/inference.onnx` (Greek v5) -- Cls: `.venv_docling/lib/python3.10/site-packages/rapidocr/models/ch_ppocr_mobile_v2.0_cls_infer.onnx` - -Notes - -- The classifier must match expected input dims; RapidOCR’s bundled `ch_ppocr_mobile_v2.0_cls_infer.onnx` is compatible. -- If your own cls ONNX fails with input shape errors, switch to the bundled one. - -Conversion options for rec/det - -- Option A: Use preconverted ONNX you already have in `/mnt/data/models/paddlev5/*_onnx/`. 
-- Option B: Convert Paddle inference → ONNX using RapidAI’s PaddleOCRModelConvert or Paddle’s official tooling. High-level steps: - 1) Obtain Paddle inference folders for PP‑OCRv5 models (det server/mobile, Greek rec). See `find_v5models.md` and `how_toget_models.md`. - 2) Use a converter script (e.g., PaddleOCRModelConvert) to export `inference.onnx` for each. - 3) Verify the ONNX loads with onnxruntime and matches expected input sizes. - -Detector (server) vs mobile - -- Server det is more accurate (slower); mobile det is faster. Use either; both work with RapidOCR. - -Sanity checks - -``` -python - << 'PY' -import onnxruntime as ort -for p in [ - '/mnt/data/models/paddlev5/det_onnx/inference.onnx', - '/mnt/data/models/paddlev5/rec_onnx/inference.onnx', - '/mnt/data/greek_paddleocr_pipeline/.venv_docling/lib/python3.10/site-packages/rapidocr/models/ch_ppocr_mobile_v2.0_cls_infer.onnx', -]: - s = ort.InferenceSession(p, providers=['CUDAExecutionProvider','CPUExecutionProvider']) - print('OK:', p) -PY -``` - diff --git a/repro_rapidocr_onnx/PATCHES.md b/repro_rapidocr_onnx/PATCHES.md deleted file mode 100644 index c603c74..0000000 --- a/repro_rapidocr_onnx/PATCHES.md +++ /dev/null @@ -1,24 +0,0 @@ -Docling rapidocr keys mapping (one-line patch) - -Docling 2.48.0 sets `"Rec.keys_path"` in RapidOCR’s init params; RapidOCR expects `"Rec.rec_keys_path"`. - -Patch location - -- File: `.venv_docling/lib/python3.10/site-packages/docling/models/rapid_ocr_model.py` -- Change: - - From: `"Rec.keys_path": self.options.rec_keys_path,` - - To: `"Rec.rec_keys_path": self.options.rec_keys_path,` - -Reapply after reinstall - -``` -bash repro_rapidocr_onnx/scripts/repatch_docling.sh -``` - -Why it matters - -- Without this, passing `--rec-keys` is ignored and RapidOCR errors when trying to infer a dict URL for the Greek model. - -Note on explicit injection path - -- The repro runner now also sets `rec_keys_path` explicitly when using the explicit ONNX injection path. The patch remains recommended for users who rely on the factory path or run other Docling tools that construct the OCR engine via the factory. diff --git a/repro_rapidocr_onnx/README.md b/repro_rapidocr_onnx/README.md deleted file mode 100644 index d0a4ff5..0000000 --- a/repro_rapidocr_onnx/README.md +++ /dev/null @@ -1,43 +0,0 @@ -Repro: Docling + RapidOCR (ONNX, Greek) - -Purpose - -- Reproduce the working pipeline (Docling layout + RapidOCR ONNX) on any machine without relying on local paths or prior state. -- Capture the exact steps, scripts, and rationale that led to the final effective setup. 
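
Step 3 of the Quick Start below applies the one-line Docling patch described in PATCHES.md. Purely as an illustration of what `scripts/repatch_docling.sh` does (a sketch assuming the Docling 2.48.0 file layout; the shell script remains the supported path):

```
#!/usr/bin/env python3
# Illustrative Python equivalent of scripts/repatch_docling.sh (assumes Docling 2.48.0).
from pathlib import Path

import docling.models.rapid_ocr_model as rapid_ocr_model

target = Path(rapid_ocr_model.__file__)
text = target.read_text(encoding="utf-8")

# Docling 2.48.0 passes "Rec.keys_path"; RapidOCR expects "Rec.rec_keys_path".
if '"Rec.keys_path"' in text:
    target.write_text(text.replace('"Rec.keys_path"', '"Rec.rec_keys_path"'), encoding="utf-8")
    print(f"Patched {target}")
else:
    print(f"Already patched or pattern not found in {target}")
```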
- -Quick Start - -1) Create venv: `bash scripts/create_venv.sh` -2) Verify GPU providers: `python scripts/check_ort.py` → look for `CUDAExecutionProvider` -3) Patch Docling param mapping: `bash scripts/repatch_docling.sh` -4) Prepare keys file: - - With your Paddle `inference.yml`: `bash scripts/prepare_keys.sh --yml /path/to/inference.yml --out /path/to/greek_keys.txt` - - Or auto-download then extract: `bash scripts/prepare_keys.sh --download --out /path/to/greek_keys.txt` -5) Run ONNX pipeline: - - Basic: `bash scripts/run_onnx.sh --det DET.onnx --rec REC.onnx --keys greek_keys.txt --in INPUT_PDFS --out OUTPUT_DIR [--device cuda:0]` - - Use embedded text, OCR only bitmaps: add `--no-force-ocr` - - Normalize output (default on): `--normalize-output|--no-normalize-output` - - Optional math/code enrichment (Docling CodeFormula, GPU recommended): `--docling-formula [--formula-batch 8] [--docling-code]` - -What’s in this folder (and how they relate) - -- `greek_pdf_ocr.py`: the runner script. Uses Docling’s layout and RapidOCR with options for ONNX/Paddle. CLI flags: - - `--backend onnxruntime|paddle`, `--onnx-det/--onnx-rec/--onnx-cls`, `--rec-keys`, `--images-scale`, `--text-score`, `--device`. -- `requirements.txt`: precise packages to install in `.venv_docling`. -- `ENVIRONMENT.md`: venv creation, provider checks, caches; what to avoid (CPU ORT, wrong RapidOCR flavor, GPU layout surprises). -- `MODELS.md`: what ONNX models you need, how to obtain/convert, and why we auto-locate the CLS model. -- `KEYS.md`: why and how to generate the Greek PP‑OCRv5 keys from Paddle `inference.yml`. -- `PATCHES.md`: the one-line Docling patch that maps `Rec.rec_keys_path` so keys are honored by RapidOCR. -- `RUN.md`: the step-by-step flow that ties the above together (create venv → patch → keys → run). -- `TROUBLESHOOTING.md`: symptoms → fixes for the issues we actually hit (factory masking, ORT CPU/GPU collision, CLS shape errors, missing keys). -- `scripts/`: automation for all of the above (venv creation, patching, keys extraction, ORT check, and final run). - -History and rationale (why we did these steps) - -- The Docling factory error (“No class found 'rapidocr'”) masked underlying RapidOCR init errors. We verified by constructing the model directly to see real exceptions. -- RapidOCR requires the Greek recognition keys; without them it tries to infer a dict URL and fails. We extract keys from the Paddle `inference.yml` to guarantee label alignment. -- Docling 2.48.0 passes `Rec.keys_path`; RapidOCR expects `Rec.rec_keys_path`. The provided patch ensures keys are wired correctly. -- Having `onnxruntime` CPU installed alongside `onnxruntime-gpu` confused provider reporting. We uninstall CPU ORT and stick to ORT GPU. -- A mismatched CLS ONNX caused input shape errors. We auto-locate RapidOCR’s packaged `ch_ppocr_mobile_v2.0_cls_infer.onnx`, which is shape-compatible. - -Follow RUN.md for the exact sequence; use TROUBLESHOOTING.md if any step reports an error. 
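
When the factory error masks the real failure (see the history notes above and TROUBLESHOOTING.md), a minimal REPL sketch like the following constructs the model directly. It mirrors the options the runner builds; the model and keys paths are placeholders for your own files.

```
#!/usr/bin/env python3
# Debugging sketch: build RapidOcrModel directly so the real init error is visible
# (the factory message "No class found with the name 'rapidocr'" often hides it).
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    RapidOcrOptions,
)
from docling.models.rapid_ocr_model import RapidOcrModel

opts = RapidOcrOptions(
    backend="onnxruntime",
    lang=["el", "en"],
    force_full_page_ocr=True,
    det_model_path="/path/to/det/inference.onnx",     # placeholder
    rec_model_path="/path/to/rec/inference.onnx",     # placeholder
    cls_model_path="/path/to/cls.onnx",               # placeholder
    rec_keys_path="/path/to/greek_ppocrv5_keys.txt",  # placeholder
)
acc = AcceleratorOptions(num_threads=4, device=AcceleratorDevice.CUDA)

# Docling 2.48.0 signature: (enabled, artifacts_path, options, accelerator_options)
try:
    RapidOcrModel(True, None, opts, acc)
    print("RapidOCR initialized OK")
except Exception as exc:
    print("Real RapidOCR init error:", repr(exc))
```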
diff --git a/repro_rapidocr_onnx/RUN.md b/repro_rapidocr_onnx/RUN.md deleted file mode 100644 index 77dee61..0000000 --- a/repro_rapidocr_onnx/RUN.md +++ /dev/null @@ -1,86 +0,0 @@ -Run Guide - -Assumptions - -- You have `.venv_docling` for Docling + RapidOCR ORT -- You have detection/recognition ONNX files (paths are arguments; no hard-coded locations) -- Optional: set cache env vars as desired (not required) - -1) Create venv and install - -``` -python3 -m venv .venv_docling -source .venv_docling/bin/activate -pip install -U pip -pip install -r repro_rapidocr_onnx/requirements.txt -pip uninstall -y onnxruntime || true -``` - -2) Verify providers - -``` -python repro_rapidocr_onnx/scripts/check_ort.py -``` - -3) Patch Docling once - -``` -bash repro_rapidocr_onnx/scripts/repatch_docling.sh -``` - -4) Generate Greek keys - -``` -python repro_rapidocr_onnx/scripts/extract_keys.py \ - --in-yml /mnt/data/models/paddlev5/el_PP-OCRv5_mobile_rec_infer/inference.yml \ - --out /mnt/data/models/paddlev5/greek_ppocrv5_keys.txt -``` - -5) Run the pipeline (ONNX) — auto-locates packaged CLS model - -``` -bash repro_rapidocr_onnx/scripts/run_onnx.sh \ - --det /path/to/det/inference.onnx \ - --rec /path/to/rec/inference.onnx \ - --keys /path/to/greek_ppocrv5_keys.txt \ - --in /path/to/input_pdfs \ - --out /path/to/output_dir \ - --device cuda:0 \ - --text-score 0.45 \ - --images-scale 1.25 \ - --no-force-ocr \ - --normalize-output -``` - -Outputs - -- Per-PDF `.md` and `.json` in the output directory - -Optional: Enable GPU layout and Docling formula/code enrichment - -Docling’s layout runs on GPU if a CUDA-enabled PyTorch is present and you pass `--device cuda:0` (already included above). - -1) Install Torch CUDA in this venv (choose a CUDA build matching your driver; cu121 is a safe default): - -``` -source .venv_docling/bin/activate -pip install --index-url https://download.pytorch.org/whl/cu121 \ - torch==2.5.1 torchvision==0.20.1 -``` - -2) Verify and optionally enable enrichment flags: - -``` -python -c "import torch; print(torch.cuda.is_available(), torch.cuda.get_device_name(0))" -bash repro_rapidocr_onnx/scripts/run_onnx.sh ... --docling-formula --formula-batch 8 [--docling-code] -``` - -3) (Optional) Stability envs and cache path: - -``` -export NCCL_P2P_DISABLE=1 -export NCCL_IB_DISABLE=1 -export DOCLING_CACHE_DIR=/path/to/docling_cache -``` - -4) Re-run `scripts/run_onnx.sh` with `--device cuda:0` (unchanged). Layout timings will appear under the `layout` key in `.metrics.json`. diff --git a/repro_rapidocr_onnx/TROUBLESHOOTING.md b/repro_rapidocr_onnx/TROUBLESHOOTING.md deleted file mode 100644 index e25c8aa..0000000 --- a/repro_rapidocr_onnx/TROUBLESHOOTING.md +++ /dev/null @@ -1,37 +0,0 @@ -Troubleshooting - -Factory error: “No class found with the name 'rapidocr'” - -- This message often masks underlying RapidOCR init errors. -- Reveal root cause by constructing `RapidOcrModel` directly in a REPL: - - Import `RapidOcrModel` and pass options; examine the thrown exception. - -RapidOCR not registered - -- Ensure `import docling.models.rapid_ocr_model` happens before building `DocumentConverter`. -- Ensure `rapidocr_onnxruntime` is installed (not only `rapidocr`). -- Set `allow_external_plugins=True` in pipeline options. - -ORT CPU vs GPU confusion - -- If `onnxruntime` CPU is installed alongside `onnxruntime-gpu`, providers may be inconsistent. -- Uninstall CPU package: `pip uninstall -y onnxruntime`. Reinstall ORT GPU if needed. 
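
- A quick way to confirm CUDA is actually applied to a session (not just advertised) — illustrative sketch, point the placeholder path at your det ONNX:

```
#!/usr/bin/env python3
# Illustrative check: verify CUDA is applied to a real session, not just advertised.
import onnxruntime as ort

print("Available providers:", ort.get_available_providers())

sess = ort.InferenceSession(
    "/path/to/det/inference.onnx",  # placeholder path
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
# If the CPU wheel shadows onnxruntime-gpu, this silently falls back to CPUExecutionProvider.
print("Session providers:", sess.get_providers())
```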
- -Classifier shape error (80×160 expected) - -- Symptom: ONNXRuntime INVALID_ARGUMENT on `x` input dims. -- Fix: use RapidOCR’s packaged `ch_ppocr_mobile_v2.0_cls_infer.onnx` for `--onnx-cls`. - -Missing dict_url / keys file - -- Symptom: `Missing key dict_url` from RapidOCR; factory error follows. -- Fix: generate keys from the Greek Paddle `inference.yml` and pass via `--rec-keys`. - -Keys ignored - -- Ensure the Docling patch is applied so RapidOCR receives `Rec.rec_keys_path`. - -NCCL warnings (if enabling GPU layout) - -- Set `NCCL_P2P_DISABLE=1` and `NCCL_IB_DISABLE=1` as needed. - diff --git a/repro_rapidocr_onnx/greek_pdf_ocr.py b/repro_rapidocr_onnx/greek_pdf_ocr.py deleted file mode 100644 index bd1a5b2..0000000 --- a/repro_rapidocr_onnx/greek_pdf_ocr.py +++ /dev/null @@ -1,561 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Batch OCR for Greek academic PDFs with Docling + RapidOCR. - -Intent: -- Force OCR on every page (ignore embedded PDF text) -- Use Docling layout model (GPU if available) for better structure -- Default OCR backend: RapidOCR with Paddle (PP-OCRv5 Greek) -- Optional ONNX backend (will attempt to resolve ONNX models) - -Outputs: per-PDF Markdown (.md) and structured JSON (.json) -""" -from __future__ import annotations - -import argparse -import json -import tempfile -import unicodedata -import re -from pathlib import Path -from typing import Iterable - -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, - LayoutOptions, - PdfPipelineOptions, - RapidOcrOptions, - TableFormerMode, - TableStructureOptions, -) -from docling.document_converter import ( - ConversionResult, - DocumentConverter, - PdfFormatOption, -) -from docling.datamodel.settings import settings -from typing import Dict, Any, List - -# Ensure RapidOCR model registers with Docling's factory before pipeline init -import docling.models.rapid_ocr_model # force-register 'rapidocr' class - - -# --- Normalization helpers (helps math-heavy Unicode stability) --- -_ZW_RE = re.compile(r"[\u200B\u200C\u200D\uFEFF]") # zero-width space/joiners + BOM - - -def _normalize_str(s: str) -> str: - s = unicodedata.normalize("NFC", s) - s = _ZW_RE.sub("", s) - return s - - -def _normalize_obj(obj: Any) -> Any: - if isinstance(obj, str): - return _normalize_str(obj) - if isinstance(obj, list): - return [_normalize_obj(x) for x in obj] - if isinstance(obj, dict): - return {k: _normalize_obj(v) for k, v in obj.items()} - return obj - - -# (removed heuristic math wrapping; rely on explicit math OCR injection instead) - - -""" -External math OCR hook removed. Docling's CodeFormula enrichment is the only math path. 
-""" - - -def iter_pdfs(root: Path) -> Iterable[Path]: - for p in root.rglob("*.pdf"): - if p.is_file(): - yield p - - -def ensure_parent(path: Path) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - - -def make_pipeline_options( - backend: str, - device: str, - text_score: float, - hf_cache_dir: str | None, - onnx_det: str | None = None, - onnx_rec: str | None = None, - onnx_cls: str | None = None, - images_scale: float = 1.25, - rec_keys: str | None = None, - formula_enrichment: bool = False, - code_enrichment: bool = False, - force_ocr: bool = True, - num_threads: int = 11, -) -> PdfPipelineOptions: - # Enforce 11 threads for reproducibility/comparison - acc = AcceleratorOptions( - num_threads=int(num_threads), - device=AcceleratorDevice.CUDA if device.lower().startswith("cuda") else AcceleratorDevice.CPU, - ) - - ocr_opts = RapidOcrOptions( - backend=backend, # 'paddle' or 'onnxruntime' - lang=["el", "en"], # Greek + English - force_full_page_ocr=bool(force_ocr), - use_det=True, - use_cls=True, - use_rec=True, - text_score=text_score, - print_verbose=False, - ) - - if backend == "onnxruntime": - if onnx_det and onnx_rec and onnx_cls: - ocr_opts.det_model_path = onnx_det - ocr_opts.rec_model_path = onnx_rec - ocr_opts.cls_model_path = onnx_cls - if rec_keys: - ocr_opts.rec_keys_path = rec_keys - else: - try: - from huggingface_hub import snapshot_download # type: ignore - except Exception as e: # pragma: no cover - raise RuntimeError( - "Provide --onnx-det/--onnx-rec/--onnx-cls or install huggingface_hub to auto-download." - ) from e - base = Path( - snapshot_download( - repo_id="RapidAI/RapidOCR", - cache_dir=hf_cache_dir, - local_files_only=False, - allow_patterns=[ - "onnx/PP-OCRv5/det/*", - "onnx/PP-OCRv5/rec/*", - "onnx/PP-OCRv4/cls/*", - ], - ) - ) - det = base / "onnx" / "PP-OCRv5" / "det" / "ch_PP-OCRv5_server_det.onnx" - cand = [ - base / "onnx" / "PP-OCRv5" / "rec" / "el_PP-OCRv5_rec_server_infer.onnx", - base / "onnx" / "PP-OCRv5" / "rec" / "latin_PP-OCRv5_rec_server_infer.onnx", - base / "onnx" / "PP-OCRv5" / "rec" / "en_PP-OCRv5_rec_server_infer.onnx", - ] - rec = next((c for c in cand if c.exists()), None) - cls = base / "onnx" / "PP-OCRv4" / "cls" / "ch_ppocr_mobile_v2.0_cls_infer.onnx" - if not det.exists() or rec is None or not cls.exists(): - raise FileNotFoundError( - "Provide explicit Greek ONNX paths via --onnx-det/--onnx-rec/--onnx-cls; HF bundle may lack Greek." 
- ) - ocr_opts.det_model_path = str(det) - ocr_opts.rec_model_path = str(rec) - ocr_opts.cls_model_path = str(cls) - if rec_keys: - ocr_opts.rec_keys_path = rec_keys - - table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) - - opts = PdfPipelineOptions( - accelerator_options=acc, - ocr_options=ocr_opts, - layout_options=LayoutOptions(), - do_ocr=True, - do_table_structure=True, - do_formula_enrichment=bool(formula_enrichment), - do_code_enrichment=bool(code_enrichment), - force_backend_text=False, - generate_parsed_pages=False, - table_structure_options=table_opts, - allow_external_plugins=True, - ) - - # Nudge thin glyphs for better det/rec on math-heavy Greek - try: - setattr(opts, "images_scale", images_scale) - except Exception: - pass - - # Belt-and-suspenders in case older Docling ignores ctor flag - try: - setattr(opts, "allow_external_plugins", True) - except Exception: - pass - - return opts - - -def convert_pdf(converter: DocumentConverter, pdf_path: Path) -> ConversionResult: - return converter.convert(source=str(pdf_path)) - - -def export_results( - conv: ConversionResult, - out_dir: Path, - pdf_path: Path, - normalize_output: bool = False, -) -> None: - doc = conv.document - md_path = out_dir / f"{pdf_path.stem}.md" - json_path = out_dir / f"{pdf_path.stem}.json" - metrics_path = out_dir / f"{pdf_path.stem}.metrics.json" - ensure_parent(md_path) - ensure_parent(json_path) - md = doc.export_to_markdown() - as_dict = doc.export_to_dict() - if normalize_output: - md = _normalize_str(md) - md_path.write_text(md, encoding="utf-8") - if normalize_output: - as_dict = _normalize_obj(as_dict) - json_path.write_text(json.dumps(as_dict, ensure_ascii=False, indent=2), encoding="utf-8") - - # Export timings if profiling is enabled - try: - metrics: Dict[str, Any] = {"file": str(pdf_path), "timings": {}} - def _quantiles(vals: List[float], q: float) -> float: - if not vals: - return 0.0 - s = sorted(vals) - idx = int(round((len(s)-1) * q)) - return float(s[idx]) - for key, item in conv.timings.items(): - times = list(item.times) - count = int(item.count) - total = float(sum(times)) if times else 0.0 - avg = float(total / count) if count else 0.0 - metrics["timings"][key] = { - "scope": str(item.scope.value) if hasattr(item, "scope") else "unknown", - "count": count, - "total_sec": total, - "avg_sec": avg, - "p50_sec": _quantiles(times, 0.50), - "p90_sec": _quantiles(times, 0.90), - "times_sec": times, - } - metrics_path.write_text(json.dumps(metrics, ensure_ascii=False, indent=2), encoding="utf-8") - except Exception: - # Do not fail conversion because of metrics - pass - - -def _compute_per_page_metrics(conv: ConversionResult) -> Dict[str, Any]: - """Compute per-page metrics from conv.timings and document export. 
- - Returns dict with keys: - - file: str - - page_count: int - - pages: list of {page_no, ocr_sec, parse_sec, layout_sec, table_sec, formula_count, formula_chars, code_count, doc_enrich_share_sec} - - totals: {doc_enrich_total_sec: float} - """ - res: Dict[str, Any] = {"file": str(getattr(conv.input.file, 'name', 'unknown'))} - doc = conv.document - try: - page_count = len(doc.pages) # type: ignore[attr-defined] - except Exception: - page_count = 0 - res["page_count"] = int(page_count) - - # Build timings map - timings: Dict[str, Dict[str, Any]] = {} - try: - for key, item in conv.timings.items(): - times = list(item.times) - timings[key] = { - "scope": str(getattr(getattr(item, 'scope', None), 'value', 'unknown')), - "times": times, - "total": float(sum(times)) if times else float(getattr(item, 'total', 0.0)), - } - except Exception: - pass - - def _page_times(key: str) -> List[float]: - arr = timings.get(key, {}).get("times", []) or [] - if page_count and len(arr) == page_count: - return [float(x) for x in arr] - # If lengths differ, pad/truncate to page_count - if page_count: - arr = (arr + [0.0] * page_count)[:page_count] - return [float(x) for x in arr] - - ocr_times = _page_times("ocr") - parse_times = _page_times("page_parse") - layout_times = _page_times("layout") - table_times = _page_times("table_structure") - - # Content counts per page (formulas/code), with sanitization and capping - formula_count = [0] * max(1, page_count) - formula_chars = [0] * max(1, page_count) - formula_trunc = [0] * max(1, page_count) - formula_trunc_chars = [0] * max(1, page_count) - code_count = [0] * max(1, page_count) - try: - as_dict = doc.export_to_dict() - # Sanitization helpers - import re as _re - _run_pat = _re.compile(r"\\\\\s*&(?P(?:\\quad|\\;|\\:|\\,|\\\\s|\s){200,})") - _ws_collapse = _re.compile(r"(?:(?:\\quad|\\;|\\:|\\,|\\\\s)|\s){2,}") - _CAP = 3000 - def _sanitize(s: str) -> tuple[str, int]: - """Return (text, dropped_chars). 
Apply whitespace-run truncation and length cap, collapse runs.""" - dropped = 0 - m = _run_pat.search(s) - if m: - s_new = s[: m.start('ws')] - dropped += len(s) - len(s_new) - s = s_new - if len(s) > _CAP: - cut = s.rfind('\\\\', 0, _CAP) - if cut < 0: - cut = _CAP - dropped += len(s) - cut - s = s[:cut] - # collapse excessive whitespace macros (optional; cheap) - s2 = _ws_collapse.sub(' ', s) - # do not count collapsed removal toward dropped (cosmetic) - return s2, dropped - def _emit(label: str, counter: List[int], charsum: List[int] | None = None): - for node in as_dict.get("texts", []): - if str(node.get("label")) != label: - continue - txt_raw = str(node.get("text") or node.get("orig") or "") - txt, dropped = _sanitize(txt_raw) if label == 'formula' else (txt_raw, 0) - ch = len(txt) - for prov in node.get("prov", []) or []: - pno = int(prov.get("page_no") or 0) - if pno >= 1 and pno <= len(counter): - counter[pno - 1] += 1 - if charsum is not None: - charsum[pno - 1] += ch - if label == 'formula' and dropped: - formula_trunc[pno - 1] += 1 - formula_trunc_chars[pno - 1] += int(dropped) - _emit("formula", formula_count, formula_chars) - _emit("code", code_count, None) - except Exception: - pass - - # Distribute document-level enrichment time across pages (estimated) - try: - doc_enrich_total = float(timings.get("doc_enrich", {}).get("total", 0.0)) - except Exception: - doc_enrich_total = 0.0 - res["totals"] = {"doc_enrich_total_sec": doc_enrich_total} - shares = [0.0] * max(1, page_count) - if doc_enrich_total and page_count: - s = float(sum(formula_chars)) or float(sum(formula_count)) or 0.0 - if s > 0: - base = formula_chars if sum(formula_chars) > 0 else formula_count - shares = [doc_enrich_total * (float(x) / s) for x in base] - else: - # no formulas, keep zeros - shares = [0.0] * page_count - - rows: List[Dict[str, Any]] = [] - n = max(page_count, len(ocr_times), len(parse_times)) - for i in range(n): - rows.append({ - "page_no": i + 1, - "ocr_sec": float(ocr_times[i]) if i < len(ocr_times) else 0.0, - "parse_sec": float(parse_times[i]) if i < len(parse_times) else 0.0, - "layout_sec": float(layout_times[i]) if i < len(layout_times) else 0.0, - "table_sec": float(table_times[i]) if i < len(table_times) else 0.0, - "formula_count": int(formula_count[i]) if i < len(formula_count) else 0, - "formula_chars": int(formula_chars[i]) if i < len(formula_chars) else 0, - "formula_truncated": int(formula_trunc[i]) if i < len(formula_trunc) else 0, - "formula_truncated_chars": int(formula_trunc_chars[i]) if i < len(formula_trunc_chars) else 0, - "code_count": int(code_count[i]) if i < len(code_count) else 0, - "doc_enrich_share_sec": float(shares[i]) if i < len(shares) else 0.0, - }) - res["pages"] = rows - return res - - -def main() -> None: - ap = argparse.ArgumentParser(description="Batch Greek OCR with Docling + RapidOCR") - ap.add_argument("input_dir", type=Path) - ap.add_argument("output_dir", type=Path) - # Enforce ONNXRuntime backend for GPU OCR; disallow Paddle in these tests - ap.add_argument("--backend", default="onnxruntime", choices=["onnxruntime"], help="OCR backend (GPU-only ONNXRuntime)") - ap.add_argument("--device", default="cuda:0", help="Docling accelerator device, e.g., cuda:0 or cpu") - ap.add_argument("--text-score", type=float, default=0.50) - ap.add_argument("--hf-cache-dir", type=str, default=None) - ap.add_argument("--onnx-det", type=str, required=True, help="Path to detection ONNX (inference.onnx)") - ap.add_argument("--onnx-rec", type=str, required=True, 
help="Path to recognition ONNX (inference.onnx)") - ap.add_argument("--onnx-cls", type=str, required=True, help="Path to classifier ONNX (inference.onnx)") - ap.add_argument("--images-scale", type=float, default=1.25, help="Raster scale factor before OCR (e.g., 1.25–1.5)") - ap.add_argument("--rec-keys", type=str, required=True, help="Path to recognition keys dict (ppocr keys)") - ap.add_argument("--force-ocr", dest="force_ocr", action="store_true", help="Force full-page OCR (ignore embedded text)") - ap.add_argument("--no-force-ocr", dest="force_ocr", action="store_false", help="Let Docling decide when to OCR (use embedded text when available)") - ap.set_defaults(force_ocr=True) - ap.add_argument("--docling-formula", dest="docling_formula", action="store_true", help="Enable Docling formula enrichment (CodeFormula, runs on GPU)") - ap.add_argument("--no-docling-formula", dest="docling_formula", action="store_false") - ap.set_defaults(docling_formula=False) - ap.add_argument("--formula-batch", type=int, default=5, help="Docling CodeFormula batch size (default 5)") - ap.add_argument("--docling-code", dest="docling_code", action="store_true", help="Enable Docling code enrichment (shares model with formula)") - ap.add_argument("--no-docling-code", dest="docling_code", action="store_false") - ap.set_defaults(docling_code=False) - # Output normalization (helps math-heavy pages by stabilizing Unicode forms) - ap.add_argument("--normalize-output", dest="normalize_output", action="store_true") - ap.add_argument("--no-normalize-output", dest="normalize_output", action="store_false") - ap.set_defaults(normalize_output=True) - # (external math OCR removed; rely on Docling formula enrichment only) - args = ap.parse_args() - - if not args.input_dir.exists(): - raise SystemExit(f"Input dir not found: {args.input_dir}") - args.output_dir.mkdir(parents=True, exist_ok=True) - - # Enable per-stage timing collection so we can export metrics - try: - settings.debug.profile_pipeline_timings = True - except Exception: - pass - - # GPU-only preflight & formula tuning - # 1) ORT GPU providers - try: - import onnxruntime as ort # type: ignore - providers = ort.get_available_providers() - if "CUDAExecutionProvider" not in providers: - raise SystemExit(f"GPU-only policy: onnxruntime does not report CUDAExecutionProvider (providers={providers}). Install onnxruntime-gpu and NVIDIA drivers.") - except Exception as e: - raise SystemExit(f"GPU-only policy: onnxruntime-gpu not available: {e}") - # 2) Torch CUDA for formula enrichment - if args.docling_formula: - try: - import torch # type: ignore - if not torch.cuda.is_available(): - raise SystemExit("GPU-only policy: Torch CUDA not available but formula enrichment requested. 
Install CUDA-enabled PyTorch.") - try: - torch.set_float32_matmul_precision('high') - except Exception: - pass - try: - from docling.models.code_formula_model import CodeFormulaModel # type: ignore - if isinstance(args.formula_batch, int) and args.formula_batch > 0: - CodeFormulaModel.elements_batch_size = int(args.formula_batch) # type: ignore[attr-defined] - except Exception: - pass - except Exception as e: - raise SystemExit(f"GPU-only policy: Torch CUDA preflight failed: {e}") - # 3) Model paths exist - import os as _os - for pth, name in ((args.onnx_det, 'det'), (args.onnx_rec, 'rec'), (args.onnx_cls, 'cls'), (args.rec_keys, 'keys')): - if not pth or not _os.path.isfile(pth): - raise SystemExit(f"GPU-only policy: Missing or unreadable {name} path: {pth}") - - opts = make_pipeline_options( - backend="onnxruntime", - device=args.device, - text_score=args.text_score, - hf_cache_dir=args.hf_cache_dir, - onnx_det=args.onnx_det, - onnx_rec=args.onnx_rec, - onnx_cls=args.onnx_cls, - images_scale=args.images_scale, - rec_keys=args.rec_keys, - formula_enrichment=args.docling_formula, - code_enrichment=args.docling_code, - force_ocr=args.force_ocr, - ) - - # If ONNX backend with explicit ONNX paths is provided and PdfPipeline is available, - # build RapidOCR explicitly and inject into the pipeline to avoid factory name quirks. - try: - from docling.models.rapid_ocr_model import RapidOcrModel # type: ignore - # Prefer the standard pipeline class that supports OCR injection - try: - from docling.pipelines.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - except Exception: - from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - except Exception: - RapidOcrModel = None # type: ignore - StandardPdfPipeline = None # type: ignore - - if ( - args.backend == "onnxruntime" - and args.onnx_det and args.onnx_rec and args.onnx_cls - and RapidOcrModel is not None and 'StandardPdfPipeline' in globals() and StandardPdfPipeline is not None - ): - try: - ocr_opts = RapidOcrOptions( - backend="onnxruntime", - lang=["el", "en"], - force_full_page_ocr=True, - use_det=True, - use_cls=True, - use_rec=True, - text_score=args.text_score, - det_model_path=args.onnx_det, - rec_model_path=args.onnx_rec, - cls_model_path=args.onnx_cls, - print_verbose=False, - ) - if args.rec_keys: - # Ensure Greek keys are respected in the explicit injection path - ocr_opts.rec_keys_path = args.rec_keys - acc = opts.accelerator_options - # Docling 2.48.0 RapidOcrModel signature: (enabled, artifacts_path, options, accelerator_options) - ocr_model = RapidOcrModel(True, None, ocr_opts, acc) # type: ignore[arg-type] - pipeline = StandardPdfPipeline(opts, ocr_model=ocr_model) # type: ignore - - found = False - for pdf in iter_pdfs(args.input_dir): - found = True - try: - conv = pipeline.convert(source=str(pdf)) - export_results( - conv, - args.output_dir, - pdf, - normalize_output=args.normalize_output, - ) - print(f"[OK] {pdf}") - except Exception as e: - print(f"[FAIL] {pdf}: {e}") - if not found: - raise SystemExit(f"No PDFs found under {args.input_dir}") - return - except Exception as e: - print(f"[WARN] Explicit RapidOCR injection failed, falling back to factory: {e}") - - # Factory path (ONNX only; Paddle path disabled for GPU-only policy) - converter = DocumentConverter( - format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)} - ) - found = False - for pdf in iter_pdfs(args.input_dir): - found = True - try: - conv = convert_pdf(converter, pdf) - export_results( - conv, 
- args.output_dir, - pdf, - normalize_output=args.normalize_output, - ) - # Emit per-page metrics file and per-page logs - try: - from pathlib import Path as _Path - per_page = _compute_per_page_metrics(conv) - (_Path(args.output_dir) / f"{pdf.stem}.per_page.metrics.json").write_text( - json.dumps(per_page, ensure_ascii=False, indent=2), encoding="utf-8" - ) - # Console summary per page - for row in per_page.get("pages", []): - print(f"[PAGE] {pdf.name} p{row['page_no']}: parse={row['parse_sec']:.3f}s ocr={row['ocr_sec']:.3f}s formulas={row['formula_count']} code={row['code_count']}") - except Exception as e: - print(f"[WARN] per-page metrics failed for {pdf}: {e}") - print(f"[OK] {pdf}") - except Exception as e: - print(f"[FAIL] {pdf}: {e}") - if not found: - raise SystemExit(f"No PDFs found under {args.input_dir}") - - -if __name__ == "__main__": - main() diff --git a/repro_rapidocr_onnx/requirements.txt b/repro_rapidocr_onnx/requirements.txt deleted file mode 100644 index 4f94361..0000000 --- a/repro_rapidocr_onnx/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -docling[rapidocr]==2.48.0 -rapidocr_onnxruntime==1.4.4 -onnxruntime-gpu==1.18.1 -numpy<2 -pyyaml>=6.0 -tqdm>=4.67 diff --git a/repro_rapidocr_onnx/scripts/check_ort.py b/repro_rapidocr_onnx/scripts/check_ort.py deleted file mode 100644 index e1ba121..0000000 --- a/repro_rapidocr_onnx/scripts/check_ort.py +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env python3 -import onnxruntime as ort -print('ORT providers:', ort.get_available_providers()) - diff --git a/repro_rapidocr_onnx/scripts/create_venv.sh b/repro_rapidocr_onnx/scripts/create_venv.sh deleted file mode 100755 index f9f6116..0000000 --- a/repro_rapidocr_onnx/scripts/create_venv.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -cd "$(dirname "$0")/../.." 
# repo root - -python3 -m venv .venv_docling -source .venv_docling/bin/activate -python -m pip install -U pip -pip install -r repro_rapidocr_onnx/requirements.txt -# Avoid CPU ORT shadowing -pip uninstall -y onnxruntime || true -echo "Venv ready: .venv_docling" - diff --git a/repro_rapidocr_onnx/scripts/extract_keys.py b/repro_rapidocr_onnx/scripts/extract_keys.py deleted file mode 100644 index 0f21963..0000000 --- a/repro_rapidocr_onnx/scripts/extract_keys.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 -import argparse -from pathlib import Path -import yaml - -def main(): - ap = argparse.ArgumentParser() - ap.add_argument('--in-yml', required=True, help='Path to inference.yml for Greek rec') - ap.add_argument('--out', required=True, help='Output keys file path') - args = ap.parse_args() - - data = yaml.safe_load(Path(args.in_yml).read_text(encoding='utf-8')) - chars = data['PostProcess']['character_dict'] - out = Path(args.out) - out.parent.mkdir(parents=True, exist_ok=True) - with out.open('w', encoding='utf-8') as f: - for ch in chars: - f.write(("'" if ch == "'" else ch) + "\n") - print(f'Wrote {out} with {len(chars)} keys') - -if __name__ == '__main__': - main() - diff --git a/repro_rapidocr_onnx/scripts/prepare_keys.sh b/repro_rapidocr_onnx/scripts/prepare_keys.sh deleted file mode 100755 index 46c0f3b..0000000 --- a/repro_rapidocr_onnx/scripts/prepare_keys.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -usage() { - cat <&2; usage; exit 1;; - esac -done - -if [[ -z "$OUT" ]]; then usage; exit 1; fi - -if [[ -z "$YML" && -n "$DL" ]]; then - TMPD=$(mktemp -d) - TAR="$TMPD/el_PP-OCRv5_mobile_rec_infer.tar" - URL="https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0.0/el_PP-OCRv5_mobile_rec_infer.tar" - echo "Downloading $URL ..." - if command -v wget >/dev/null; then wget -q "$URL" -O "$TAR"; else curl -L "$URL" -o "$TAR"; fi - mkdir -p "$TMPD/el_PP-OCRv5_mobile_rec_infer" - tar -xf "$TAR" -C "$TMPD/el_PP-OCRv5_mobile_rec_infer" - YML="$TMPD/el_PP-OCRv5_mobile_rec_infer/inference.yml" -fi - -[[ -f "$YML" ]] || { echo "Missing --yml $YML" >&2; exit 1; } - -source .venv_docling/bin/activate 2>/dev/null || true -python repro_rapidocr_onnx/scripts/extract_keys.py --in-yml "$YML" --out "$OUT" - diff --git a/repro_rapidocr_onnx/scripts/repatch_docling.sh b/repro_rapidocr_onnx/scripts/repatch_docling.sh deleted file mode 100755 index e8b7bf6..0000000 --- a/repro_rapidocr_onnx/scripts/repatch_docling.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# Activate venv if present -if [[ -f .venv_docling/bin/activate ]]; then source .venv_docling/bin/activate; fi - -SITE_FILE=$(python - <<'PY' -import inspect, docling.models.rapid_ocr_model as m -print(m.__file__) -PY -) - -if [[ -z "$SITE_FILE" || ! 
-f "$SITE_FILE" ]]; then - echo "Cannot locate docling.models.rapid_ocr_model.py (is the venv active?)" >&2 - exit 1 -fi - -if rg -n "Rec\.keys_path" "$SITE_FILE" >/dev/null; then - sed -i "s/\"Rec.keys_path\"/\"Rec.rec_keys_path\"/" "$SITE_FILE" - echo "Patched $SITE_FILE (Rec.rec_keys_path)" -else - echo "Already patched or pattern not found in $SITE_FILE" -fi diff --git a/repro_rapidocr_onnx/scripts/run_onnx.sh b/repro_rapidocr_onnx/scripts/run_onnx.sh deleted file mode 100755 index 30aa337..0000000 --- a/repro_rapidocr_onnx/scripts/run_onnx.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -usage() { - cat <&2; usage; exit 1;; - esac -done - -[[ -f "$DET" && -f "$REC" && -f "$KEYS" && -d "$IN" ]] || { echo "Missing inputs" >&2; usage; exit 1; } -mkdir -p "$OUT" - -# Activate venv if present -if [[ -f .venv_docling/bin/activate ]]; then source .venv_docling/bin/activate; fi - -# Find packaged CLS ONNX -CLS=$(python - <<'PY' -import sys, pkgutil, pathlib -import rapidocr -base = pathlib.Path(rapidocr.__file__).parent / 'models' -cands = list(base.glob('*cls*_infer.onnx')) -for p in cands: - if 'ch_ppocr_mobile_v2.0_cls_infer.onnx' in p.name: - print(p) - break -else: - print(cands[0] if cands else '', end='') -PY -) -if [[ -z "$CLS" || ! -f "$CLS" ]]; then - echo "Could not locate packaged CLS model in rapidocr/models" >&2 - exit 1 -fi - -PYTHONPATH=$(pwd) python greek_pdf_ocr.py \ - --backend onnxruntime \ - --device "$DEV" \ - --onnx-det "$DET" \ - --onnx-rec "$REC" \ - --onnx-cls "$CLS" \ - --rec-keys "$KEYS" \ - --text-score "$SCORE" \ - --images-scale "$SCALE" \ - $( [[ $FORCE_OCR -eq 1 ]] && echo --force-ocr || echo --no-force-ocr ) \ - $( [[ $NORM -eq 1 ]] && echo --normalize-output || echo --no-normalize-output ) \ - $( [[ $FORMULA -eq 1 ]] && echo --docling-formula || true ) \ - $( [[ $CODE -eq 1 ]] && echo --docling-code || true ) \ - $( [[ -n "$FBATCH" ]] && echo --formula-batch "$FBATCH" || true ) \ - "$IN" "$OUT" diff --git a/src/glossapi/corpus/phase_extract.py b/src/glossapi/corpus/phase_extract.py index a584eaf..44b9eaf 100644 --- a/src/glossapi/corpus/phase_extract.py +++ b/src/glossapi/corpus/phase_extract.py @@ -107,7 +107,8 @@ def prime_extractor( # Configure batch/backend policy based on resolved choice if backend_choice == "docling": # Keep docling runs conservative: process one document per batch for stability - self.extractor.configure_batch_policy("docling", max_batch_files=1, prefer_safe_backend=False) + #changed batching to 5 below: + self.extractor.configure_batch_policy("docling", max_batch_files=5, prefer_safe_backend=False) else: self.extractor.configure_batch_policy("safe", max_batch_files=1, prefer_safe_backend=True) @@ -138,7 +139,7 @@ def _resolve_phase1_backend( ) needs_gpu = bool(force_ocr or formula_enrichment or code_enrichment) if choice == "auto": - choice = "docling" if needs_gpu else "safe" + choice = "docling" #removed safe option if choice == "safe" and needs_gpu: self.logger.info( "Phase-1 backend 'safe' overridden to 'docling' because OCR/math enrichment was requested."