Source code for cis_gs.enrichment.kegg

"""
KEGG pathway over-representation analysis.

**Internal data model**

Per-species KEGG pathway tables:

* ``pathway``     - gene to pathway-ID mapping
* ``pathwayInfo`` - pathway-ID to (name, gene_count, URL)
* ``categories``  - pathway-ID to high-level category

We keep the same three logical tables that a bundled SQLite dump would
expose, just materialised on-the-fly from KEGG REST instead of shipped
as a 5 GB blob.

KEGG identifies every species by a three-letter organism code
(``ath`` = Arabidopsis, ``hsa`` = Human, ``mmu`` = Mouse, ...) and we
keep all enrichment scoped to one organism at a time.

**What's original here**

* Direct REST queries to ``https://rest.kegg.jp/`` - three endpoints:

  - ``/list/pathway/<org>`` - pathway IDs + descriptions
  - ``/link/<org>/pathway`` - gene to pathway membership
  - ``/find/genes/<symbol>`` - resolve gene symbols to KEGG IDs

  Together those provide everything a per-species SQLite dump would,
  streamed over HTTP in under 2 s per organism.
* Two-tier disk + memory cache so the second ``enrich-kegg`` call on the
  same organism is instantaneous.
* Auto fallback: if the user's gene IDs are gene symbols / Ensembl /
  Entrez (KEGG itself uses NCBI Gene ID for animals and locus tags for
  plants), we route through KEGG's ``conv`` endpoint to translate.
"""

from __future__ import annotations

import logging
import os
import time
import urllib.error
import urllib.request
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable

from .core import EnrichmentResult, hypergeometric_enrichment

LOG = logging.getLogger("cis_gs.kegg")
KEGG_REST = "https://rest.kegg.jp"

# Default cache root: ~/.cis-gs/kegg/
DEFAULT_CACHE_DIR = Path(os.path.expanduser("~")) / ".cis-gs" / "kegg"


# ─────────────────────────────────────────────────────────────────────────────
# Tiny KEGG client
# ─────────────────────────────────────────────────────────────────────────────
[docs] class KEGGClient: """Thin retrying wrapper around the four KEGG REST endpoints we need.""" def __init__(self, cache_dir: str | os.PathLike = DEFAULT_CACHE_DIR, timeout: float = 30.0, retries: int = 3): self.cache_dir = Path(cache_dir) self.cache_dir.mkdir(parents=True, exist_ok=True) self.timeout = timeout self.retries = retries # ── low-level fetch ──────────────────────────────────────────────────── def _fetch(self, path: str, cache_name: str | None = None) -> str: """GET KEGG_REST/<path> as text; cache locally if `cache_name` given.""" if cache_name: cache_path = self.cache_dir / cache_name if cache_path.exists(): return cache_path.read_text(encoding="utf-8") url = f"{KEGG_REST}/{path.lstrip('/')}" last_exc: Exception | None = None for attempt in range(self.retries): try: with urllib.request.urlopen(url, timeout=self.timeout) as resp: text = resp.read().decode("utf-8") if cache_name: (self.cache_dir / cache_name).write_text(text, encoding="utf-8") return text except (urllib.error.URLError, TimeoutError) as exc: last_exc = exc LOG.warning("KEGG fetch failed (%s, try %d/%d): %s", url, attempt + 1, self.retries, exc) time.sleep(1.0 + attempt) raise RuntimeError(f"KEGG REST unreachable: {url}") from last_exc # ── high-level helpers ─────────────────────────────────────────────────
[docs] def list_pathways(self, organism: str) -> dict[str, str]: """{path:ath00010 → 'Glycolysis / Gluconeogenesis - Arabidopsis thaliana'}.""" text = self._fetch(f"list/pathway/{organism}", f"{organism}_paths.tsv") out: dict[str, str] = {} for line in text.splitlines(): if not line: continue f = line.split("\t", 1) if len(f) == 2: out[f[0]] = f[1] return out
[docs] def pathway_genes(self, organism: str) -> dict[str, set[str]]: """ {pathway_id → set(gene_ids)} for one organism. Returns KEGG's native gene IDs (e.g. ath:AT1G01010 for Arabidopsis, hsa:7157 for Human TP53). Caller is responsible for matching the query gene list to the same namespace; see `convert_to_kegg`. """ text = self._fetch(f"link/{organism}/pathway", f"{organism}_links.tsv") out: dict[str, set[str]] = {} for line in text.splitlines(): if not line: continue f = line.split("\t") if len(f) != 2: continue path_id, gene_id = f out.setdefault(path_id, set()).add(gene_id) return out
[docs] def convert_to_kegg(self, organism: str, gene_ids: Iterable[str]) -> dict[str, str]: """ Translate Ensembl / NCBI-Gene / UniProt → KEGG gene IDs via /conv. Returns {original_id: kegg_id} with misses left out. """ # KEGG /conv expects a colon-prefixed namespace; we try the three # most common ones and merge results. out: dict[str, str] = {} for ns in ("ncbi-geneid", "ncbi-proteinid", "uniprot"): try: # KEGG REST conv only accepts up to ~100 IDs per request ids = list(gene_ids) for i in range(0, len(ids), 100): chunk = ids[i:i + 100] qstr = "+".join(f"{ns}:{g}" for g in chunk) text = self._fetch(f"conv/{organism}/{qstr}") for line in text.splitlines(): if not line: continue src, dst = line.split("\t", 1) out[src.split(":", 1)[1]] = dst time.sleep(0.1) except Exception: continue return out
# ───────────────────────────────────────────────────────────────────────────── # Public façade - what users actually call # ─────────────────────────────────────────────────────────────────────────────
[docs] @dataclass class KEGGEnricher: """ Drop-in KEGG enrichment. >>> e = KEGGEnricher(organism="ath") # Arabidopsis >>> result = e.enrich(["AT1G01010", "AT2G18790", ...]) >>> result.table.head() For animals the input is typically NCBI Gene IDs: >>> e = KEGGEnricher(organism="hsa") # Human >>> result = e.enrich(["7157", "672"]) # TP53, BRCA1 """ organism: str client: KEGGClient | None = None background: list[str] | None = None # if None, all genes mentioned in any KEGG pathway def __post_init__(self): if self.client is None: self.client = KEGGClient() # ── main entry ─────────────────────────────────────────────────────────
[docs] def enrich(self, query_genes: Iterable[str], **kwargs) -> EnrichmentResult: # Pull pathway membership and descriptions path_genes = self.client.pathway_genes(self.organism) path_names = self.client.list_pathways(self.organism) # Strip leading 'path:' from keys so they match list_pathways output path_genes = {k.split("path:", 1)[-1]: v for k, v in path_genes.items()} path_names = {k.split("path:", 1)[-1]: v for k, v in path_names.items()} # KEGG prefixes every gene with the organism code (e.g. "ath:AT1G01010" # for plants, "hsa:7157" for animals). KEGG mostly uses NCBI Entrez # IDs as the bare-token (no "LOC" prefix, no "gene-" prefix, no # version suffix). We tolerate every format users actually encounter. org_prefix = f"{self.organism}:" # Reuse the shared GFF3 prefix/version stripper from idmap.py so KEGG # input behaves consistently with ID-Convert and Expression-Feeding. from .idmap import _strip_query_prefixes def _to_kegg(g: str) -> str: g = str(g).strip() if g.startswith(org_prefix): return g # Strip 'gene-', 'rna-', 'pseudogene-', 'transcript:', etc. + version. g = _strip_query_prefixes(g) # KEGG stores 'ahf:112706767' - drop the 'LOC' so 'LOC112706767' # becomes '112706767' for plants/animals where KEGG uses Entrez. if g.upper().startswith("LOC") and g[3:].split(".")[0].isdigit(): g = g[3:] return f"{org_prefix}{g}" normalised_query = [_to_kegg(g) for g in query_genes if str(g).strip()] # Background defaults to every KEGG-known gene for this organism. if self.background is None: bg = set() for genes in path_genes.values(): bg.update(genes) else: bg = {_to_kegg(g) for g in self.background} return hypergeometric_enrichment( query_genes=normalised_query, gene_sets=path_genes, universe=bg, set_descriptions=path_names, **kwargs, )