#!/usr/bin/env bash
set -euo pipefail

# Run this script on the remote NVIDIA host. It builds llama.cpp with CUDA,
# copies the GGUF blob of an already-pulled Ollama model onto the host
# filesystem, and starts llama-server for the benchmark. Keep HOST at 127.0.0.1
# for SSH tunneling; set HOST to another bind address only for a temporary
# private-network test.
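#
# Example invocation (script name and values are illustrative; any of the
# variables below can be overridden from the environment):
#   CTX_SIZE=16384 PORT=30001 ./serve_llama_remote.sh qwen3-coder:30b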

MODEL="${1:-qwen3-coder:30b}"
CTX_SIZE="${CTX_SIZE:-32768}"
PORT="${PORT:-30000}"
HOST="${HOST:-127.0.0.1}"
SRC_DIR="${SRC_DIR:-$HOME/src/llama.cpp}"
MODEL_DIR="${MODEL_DIR:-$HOME/models/ollama-gguf}"
LOG_DIR="${LOG_DIR:-$HOME/logs}"

# Use docker directly if the daemon is reachable as the current user; otherwise fall back to sudo.
if docker ps >/dev/null 2>&1; then
  DOCKER=(docker)
else
  DOCKER=(sudo docker)
fi

# Build prerequisites only; the NVIDIA driver and CUDA toolkit (nvcc) are assumed to be installed already.
sudo apt-get update
sudo apt-get install -y git cmake build-essential pkg-config libcurl4-openssl-dev ccache

mkdir -p "$HOME/src" "$MODEL_DIR" "$LOG_DIR"
# Clone llama.cpp on the first run; later runs only fast-forward the checkout below.
if [ ! -d "$SRC_DIR/.git" ]; then
  git clone https://github.com/ggml-org/llama.cpp "$SRC_DIR"
fi

cd "$SRC_DIR"
git pull --ff-only
# Configure and build the CUDA-enabled server binaries.
cmake -B build -S . \
  -DGGML_CUDA=ON \
  -DLLAMA_CURL=ON \
  -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release -j"$(nproc)"
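
# Optional sanity checks before serving (assumed tools; uncomment if useful):
#   nvidia-smi                          # confirm the driver sees the GPU
#   ./build/bin/llama-server --version  # confirm the build produced a working binary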

# Export inputs for the embedded Python helper; the docker array is flattened to a string.
export MODEL MODEL_DIR
export DOCKER_CMD="${DOCKER[*]}"
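
# The helper below asks the running "ollama" container (assumed container name and
# default /root/.ollama store) for the model's manifest, a JSON document whose
# "layers" array lists blobs by digest and size, then copies the largest layer,
# normally the GGUF weights, out of the container as <repo>-<tag>.gguf.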
MODEL_PATH="$(python3 - <<'PY'
import json
import os
import pathlib
import subprocess

# Reassemble the docker command (e.g. "sudo docker") that was flattened into a string.
docker = os.environ["DOCKER_CMD"].split()
model = os.environ["MODEL"]
model_dir = pathlib.Path(os.environ["MODEL_DIR"]).expanduser()
# Ollama references are "repo:tag"; the tag defaults to "latest" when omitted.
repo, _, tag = model.partition(":")
tag = tag or "latest"
# Manifest location used by the Ollama Docker image's default /root/.ollama store.
manifest = f"/root/.ollama/models/manifests/registry.ollama.ai/library/{repo}/{tag}"
raw = subprocess.check_output(docker + ["exec", "ollama", "cat", manifest])
data = json.loads(raw)
# The largest layer listed in the manifest is the GGUF weights blob.
layer = max(data["layers"], key=lambda item: item.get("size", 0))
# Blobs are stored on disk with the "sha256:" digest prefix rewritten as "sha256-".
blob = layer["digest"].replace(":", "-")
out = model_dir / f"{repo}-{tag}.gguf"
model_dir.mkdir(parents=True, exist_ok=True)
# Copy the blob out of the container only when it is missing or has the wrong size.
if (not out.exists()) or out.stat().st_size != layer.get("size"):
    subprocess.check_call(docker + ["cp", f"ollama:/root/.ollama/models/blobs/{blob}", str(out)])
print(out)
PY
)"
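
# Optional guard (assumed; uncomment to fail fast if the copy did not produce a file):
# [ -s "$MODEL_PATH" ] || { echo "Model blob missing at $MODEL_PATH" >&2; exit 1; }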

# Stop any llama-server from a previous run that is still bound to this port.
pkill -f "llama-server.*--port ${PORT}" >/dev/null 2>&1 || true

SERVER_BIN="$SRC_DIR/build/bin/llama-server"
# Sanitize the model name (":" and "/" become "-") so it is safe in a filename.
LOG_FILE="$LOG_DIR/llama-server-${MODEL//[:\/]/-}.log"
echo "Starting $SERVER_BIN"
echo "Model: $MODEL_PATH"
echo "URL: http://${HOST}:${PORT}/v1/chat/completions"
echo "Log: $LOG_FILE"
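
# From your workstation, reach the server through an SSH tunnel and test it
# (user@remote-host is a placeholder; adjust the port if you changed PORT):
#   ssh -N -L 30000:127.0.0.1:30000 user@remote-host
#   curl http://127.0.0.1:30000/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"messages":[{"role":"user","content":"Say hi"}]}'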

# Start the server; exec replaces only this pipeline's subshell, and tee mirrors output to the log.
exec "$SERVER_BIN" \
  --model "$MODEL_PATH" \
  --host "$HOST" \
  --port "$PORT" \
  --ctx-size "$CTX_SIZE" \
  --n-gpu-layers 999 \
  --jinja \
  --metrics \
  2>&1 | tee "$LOG_FILE"
