Source code for turboquant.quant_bridge

"""Python ctypes bridge for GGML TurboQuant C kernels.

Provides :class:`GGMLTurboQuant` which loads the compiled shared library
and exposes quantize/dequantize/dot_product as Python callables.

The C implementation lives in ``quant_ggml.h`` (header) and the eventual
``quant_ggml.c`` (to be compiled). Until the C library is built, this
module falls back to the pure-NumPy implementation in :mod:`turboquant.quant`.

Usage
-----
>>> from turboquant.quant_bridge import GGMLTurboQuant
>>> tq = GGMLTurboQuant()
>>> if tq.available:
...     block = tq.quantize(vector, bits=3)
... else:
...     # Falls back to pure Python
...     from turboquant.quant import turboquant_mse
...     block = turboquant_mse(vector, bits=3)
"""

from __future__ import annotations

import ctypes
import logging
from ctypes import (
    POINTER,
    Structure,
    c_float,
    c_int,
    c_uint8,
    c_uint16,
    c_void_p,
)
from pathlib import Path
from typing import Any

import numpy as np
from numpy.typing import NDArray

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Number of elements per quantized block; must match the block size used by
# the C header. The block structures below size their index arrays for it.
TQ_BLOCK_SIZE = 256


# ---------------------------------------------------------------------------
# C structure mirrors
# ---------------------------------------------------------------------------



[docs]
class BlockTBQ3(Structure):
    """ctypes mirror of block_tbq3_0 (102 bytes per 256 elements).

    Fields: float32 ``norm`` (4 bytes), uint16 ``seed`` (2 bytes) and 96
    bytes of ``indices`` (256 elements x 3 bits = 96 bytes).
    """

    # With natural alignment ctypes rounds the 102 bytes of fields up to 104
    # (a multiple of the float's 4-byte alignment), which contradicts the
    # documented block size. Byte-packing removes that tail padding; field
    # offsets (0, 4, 6) are unaffected.
    # NOTE(review): assumes block_tbq3_0 is declared packed in quant_ggml.h —
    # confirm against the header.
    _pack_ = 1
    # Python 3.14+ asks for an explicit layout whenever _pack_ is set; older
    # interpreters treat this as an ordinary, unused class attribute.
    _layout_ = "ms"
    _fields_ = [
        ("norm", c_float),
        ("seed", c_uint16),
        ("indices", c_uint8 * 96),
    ]




[docs]
class BlockTBQ4(Structure):
    """ctypes mirror of block_tbq4_0 (134 bytes per 256 elements).

    Fields: float32 ``norm`` (4 bytes), uint16 ``seed`` (2 bytes) and 128
    bytes of ``indices`` (256 elements x 4 bits = 128 bytes).
    """

    # With natural alignment ctypes rounds the 134 bytes of fields up to 136
    # (a multiple of the float's 4-byte alignment), which contradicts the
    # documented block size. Byte-packing removes that tail padding; field
    # offsets (0, 4, 6) are unaffected.
    # NOTE(review): assumes block_tbq4_0 is declared packed in quant_ggml.h —
    # confirm against the header.
    _pack_ = 1
    # Python 3.14+ asks for an explicit layout whenever _pack_ is set; older
    # interpreters treat this as an ordinary, unused class attribute.
    _layout_ = "ms"
    _fields_ = [
        ("norm", c_float),
        ("seed", c_uint16),
        ("indices", c_uint8 * 128),
    ]




[docs]
class BlockTBQ2(Structure):
    """ctypes mirror of block_tbq2_0 (70 bytes per 256 elements).

    Fields: float32 ``norm`` (4 bytes), uint16 ``seed`` (2 bytes) and 64
    bytes of ``indices`` (256 elements x 2 bits = 64 bytes).
    """

    # With natural alignment ctypes rounds the 70 bytes of fields up to 72
    # (a multiple of the float's 4-byte alignment), which contradicts the
    # documented block size. Byte-packing removes that tail padding; field
    # offsets (0, 4, 6) are unaffected.
    # NOTE(review): assumes block_tbq2_0 is declared packed in quant_ggml.h —
    # confirm against the header.
    _pack_ = 1
    # Python 3.14+ asks for an explicit layout whenever _pack_ is set; older
    # interpreters treat this as an ordinary, unused class attribute.
    _layout_ = "ms"
    _fields_ = [
        ("norm", c_float),
        ("seed", c_uint16),
        ("indices", c_uint8 * 64),
    ]



# Lookup table: quantization bit width -> ctypes structure class used to hold
# one compressed block of that width.
_BLOCK_TYPES = {2: BlockTBQ2, 3: BlockTBQ3, 4: BlockTBQ4}


# ---------------------------------------------------------------------------
# GGMLTurboQuant — ctypes wrapper
# ---------------------------------------------------------------------------



[docs]
class GGMLTurboQuant:
    """Python interface to the GGML TurboQuant C library.

    Falls back to pure-NumPy when the shared library isn't compiled, cannot
    be loaded, or does not export the expected ``tq_*`` functions.

    Parameters
    ----------
    lib_path : str or Path, optional
        Path to the compiled shared library (``.dylib`` / ``.so`` / ``.dll``).
        If None, searches next to this module for ``quant_ggml.<suffix>``.
    """

    # Size in bytes of the scratch buffer passed to ``tq_init``. ``tq_context``
    # is opaque on the Python side, so the buffer is deliberately oversized.
    # NOTE(review): the original estimate was dim + bits + codebook struct,
    # about 84 bytes — confirm against quant_ggml.h.
    _CTX_BUFFER_BYTES = 256

    def __init__(self, lib_path: str | Path | None = None):
        self._lib: ctypes.CDLL | None = None
        self._available = False

        if lib_path is None:
            # Search for the compiled library next to this file
            pkg_dir = Path(__file__).parent
            for suffix in (".dylib", ".so", ".dll"):
                candidate = pkg_dir / f"quant_ggml{suffix}"
                if candidate.exists():
                    lib_path = candidate
                    break

        if lib_path and Path(lib_path).exists():
            try:
                self._lib = ctypes.CDLL(str(lib_path))
                self._setup_signatures()
            except (OSError, AttributeError) as e:
                # OSError: the library could not be loaded.
                # AttributeError: it loaded but lacks one of the tq_* symbols.
                # Either way, drop the handle and use the NumPy fallback.
                self._lib = None
                logger.debug("Could not load GGML library: %s", e)
            else:
                self._available = True
                logger.info("Loaded GGML TurboQuant library from %s", lib_path)

        if not self._available:
            logger.debug("GGML TurboQuant not available — using pure-NumPy fallback")

    def _setup_signatures(self) -> None:
        """Declare argument and return types for the C entry points.

        Raises
        ------
        AttributeError
            If the loaded library does not export one of the ``tq_*`` symbols.
        """
        lib = self._lib
        if lib is None:
            return

        # tq_init(ctx, dim, bits)
        lib.tq_init.argtypes = [c_void_p, c_int, c_int]
        lib.tq_init.restype = None

        # tq_quantize_block(src, dst, ctx, seed)
        lib.tq_quantize_block.argtypes = [
            POINTER(c_float),
            c_void_p,
            c_void_p,
            c_uint16,
        ]
        lib.tq_quantize_block.restype = None

        # tq_dequantize_block(src, dst, ctx)
        lib.tq_dequantize_block.argtypes = [
            c_void_p,
            POINTER(c_float),
            c_void_p,
        ]
        lib.tq_dequantize_block.restype = None

        # tq_dot_product(block, vec, ctx)
        lib.tq_dot_product.argtypes = [
            c_void_p,
            POINTER(c_float),
            c_void_p,
        ]
        lib.tq_dot_product.restype = c_float

    @property
    def available(self) -> bool:
        """True if the C library is loaded and ready."""
        return self._available

    def quantize(
        self,
        vector: NDArray[np.float32],
        bits: int = 3,
        seed: int = 42,
    ) -> Any:
        """Quantize a vector using the C kernel or NumPy fallback.

        Parameters
        ----------
        vector : ndarray of float32, shape (d,)
            Input vector. d must be 256 (TQ_BLOCK_SIZE); this is enforced on
            the C path.
        bits : int
            Quantization bits (2, 3, or 4).
        seed : int
            Rotation matrix seed. The C path passes it as an unsigned 16-bit
            value.

        Returns
        -------
        TQBlock or ctypes Structure
            Compressed block.

        Raises
        ------
        KeyError
            On the C path, if ``bits`` has no matching block structure.
        ValueError
            On the C path, if ``vector`` does not have shape (256,).
        """
        if self._available and self._lib is not None:
            return self._quantize_c(vector, bits, seed)
        return self._quantize_numpy(vector, bits, seed)

    def dequantize(self, block: Any, bits: int = 3) -> NDArray[np.float32]:
        """Dequantize a block back to float32 vector.

        ctypes blocks go through the C kernel when it is loaded; anything
        else is handed to the NumPy decoder.

        Raises
        ------
        KeyError
            On the C path, if ``bits`` has no matching block structure.
        TypeError
            On the C path, if ``block`` is not the structure for ``bits``.
        """
        if self._available and self._lib is not None and isinstance(block, Structure):
            return self._dequantize_c(block, bits)
        return self._dequantize_numpy(block)

    # -- C kernel paths -------------------------------------------------------

    def _make_ctx(self, bits: int, dim: int = TQ_BLOCK_SIZE) -> Any:
        """Create and initialize a tq_context via the C library."""
        # tq_context is opaque; allocate as raw bytes and call tq_init
        ctx_buf = (ctypes.c_char * self._CTX_BUFFER_BYTES)()
        self._lib.tq_init(ctypes.byref(ctx_buf), c_int(dim), c_int(bits))
        return ctx_buf

    def _quantize_c(self, vector: NDArray[np.float32], bits: int, seed: int) -> Structure:
        """Quantize one 256-element vector with the C kernel."""
        block = _BLOCK_TYPES[bits]()
        # Always work on a private, C-contiguous float32 copy so the kernel
        # gets a dense buffer and the caller's array is never touched.
        src = np.array(vector, dtype=np.float32, order="C")
        if src.shape != (TQ_BLOCK_SIZE,):
            # The block structures hold exactly TQ_BLOCK_SIZE elements; any
            # other shape would make the kernel read or write out of bounds.
            raise ValueError(
                f"vector must have shape ({TQ_BLOCK_SIZE},), got {src.shape}"
            )
        src_ptr = src.ctypes.data_as(POINTER(c_float))
        ctx = self._make_ctx(bits, TQ_BLOCK_SIZE)
        self._lib.tq_quantize_block(src_ptr, ctypes.byref(block), ctypes.byref(ctx), c_uint16(seed))
        return block

    def _dequantize_c(self, block: Structure, bits: int) -> NDArray[np.float32]:
        """Decode one block with the C kernel into a 256-element float32 array."""
        expected_type = _BLOCK_TYPES[bits]
        if not isinstance(block, expected_type):
            # A mismatched bit width would make the kernel interpret the
            # buffer as a differently sized structure.
            raise TypeError(
                f"bits={bits} expects a {expected_type.__name__} block, "
                f"got {type(block).__name__}"
            )
        dst = np.zeros(TQ_BLOCK_SIZE, dtype=np.float32)
        dst_ptr = dst.ctypes.data_as(POINTER(c_float))
        ctx = self._make_ctx(bits)
        self._lib.tq_dequantize_block(ctypes.byref(block), dst_ptr, ctypes.byref(ctx))
        return dst

    # -- NumPy fallback paths -------------------------------------------------

    @staticmethod
    def _quantize_numpy(vector: NDArray[np.float32], bits: int, seed: int) -> Any:
        """Quantize with ``turboquant.quant.turboquant_mse`` (float64 input)."""
        from turboquant.quant import turboquant_mse

        return turboquant_mse(vector.astype(np.float64), bits=bits, rotation_seed=seed)

    @staticmethod
    def _dequantize_numpy(block: Any) -> NDArray[np.float32]:
        """Decode with ``turboquant.quant.turboquant_mse_decode``, cast to float32."""
        from turboquant.quant import turboquant_mse_decode

        return turboquant_mse_decode(block).astype(np.float32)



# ---------------------------------------------------------------------------
# Convenience: compile the C library
# ---------------------------------------------------------------------------



[docs]
def compile_ggml_lib(
    output_dir: str | Path | None = None,
    optimize: bool = True,
) -> Path | None:
    """Attempt to compile the GGML TurboQuant C library.

    Requires ``cc`` (clang or gcc) on the system. Compilation itself is not
    implemented yet: the function only checks that a compiler and the
    ``quant_ggml.h`` header are present, logs the outcome, and returns None.

    Parameters
    ----------
    output_dir : str or Path, optional
        Where to write the shared library. Intended to default to this
        package's directory. Currently unused.
    optimize : bool
        Use ``-O2 -march=native`` (default True). Currently unused.

    Returns
    -------
    Path or None
        Path to compiled library, or None on failure. Always None for now.
    """
    import shutil

    cc = shutil.which("cc") or shutil.which("clang") or shutil.which("gcc")
    if cc is None:
        logger.warning("No C compiler found — cannot compile GGML library")
        return None

    header = Path(__file__).parent / "quant_ggml.h"
    if not header.exists():
        logger.warning("quant_ggml.h not found at %s", header)
        return None

    # The C source would need to be written; for now just validate the header
    logger.info(
        "GGML C library compilation not yet implemented — header validated at %s. Using NumPy fallback.",
        header,
    )
    return None