Building the Reference Model

We start by importing Whisper.cpp into our Bazel build environment. We may eventually want a full fork of the code, but not until we are sure of the base release and what directions that fork will need to take.

This import starts with a simple addition to our MODULE.bazel module file:

# MODULE.bazel
http_archive = use_repo_rule("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
# whisper.cpp is an open source speech-to-text inference app: a C/C++ port of OpenAI's
# Whisper model, built on the ggml tensor library.
# It is a useful exemplar of autovectorization of ML code with some examples of hand-coded
# riscv intrinsics.
http_archive(
    name = "whisper_cpp",
    # Pinned release tarball; strip_prefix removes the top-level directory of the archive.
    urls = ["https://github.com/ggerganov/whisper.cpp/archive/refs/tags/v1.7.1.tar.gz"],
    strip_prefix = "whisper.cpp-1.7.1/",
    # The upstream project ships no Bazel files, so we overlay our own BUILD file.
    build_file = "//:whisper-cpp.BUILD",
    # Checksum of the tarball above; must be updated together with the version in urls.
    sha256 = "97f19a32212f2f215e538ee37a16ff547aaebc54817bd8072034e02466ce6d55",
)

Next we add whisper-cpp.BUILD, which tells Bazel how to build the libraries and binaries. The rules for the whisper library and the main example binary include these stanzas:

# The whisper library: ggml sources plus the whisper model code.
cc_library(
    name = "whisper",
    # Headers listed in srcs are private to this target.
    srcs = [
        "ggml/src/ggml.c",
        "ggml/src/ggml-aarch64.c",
        "ggml/src/ggml-alloc.c",
        "ggml/src/ggml-backend.cpp",
        "ggml/src/ggml-backend-impl.h",
        "ggml/src/ggml-impl.h",
        "ggml/src/ggml-quants.c",
        "src/whisper.cpp",
    ],
    # EXTERNAL_PATH is defined outside this excerpt; it prefixes the include
    # directories inside the imported whisper_cpp repository.
    copts = [
        "-I%s/include" % EXTERNAL_PATH,
        "-I%s/ggml/include" % EXTERNAL_PATH,
        "-I%s/ggml/src" % EXTERNAL_PATH,
        "-pthread",
        "-O3",
        "-ffast-math",
    ],
    ...
    defines = [
        "NDEBUG",
        "_XOPEN_SOURCE=600",
        "_GNU_SOURCE",
        "__FINITE_MATH_ONLY__=0",
        # NOTE(review): presumably switches off the hand-coded RISC-V vector
        # intrinsic paths so vector code comes from compiler autovectorization
        # only -- confirm against the ggml sources.
        "__riscv_v_intrinsic=0",
    ],
    ...
)
# The reference command-line app (examples/main) linked against the whisper library.
cc_binary(
    name = "main",
    srcs = [
        "examples/common.cpp",
        "examples/common.h",
        "examples/common-ggml.cpp",
        "examples/common-ggml.h",
        "examples/dr_wav.h",
        "examples/grammar-parser.cpp",
        "examples/grammar-parser.h",
        "examples/main/main.cpp",
    ],
    ...
    deps = [
        ":whisper",
    ],
)

Now we can build the reference app using our existing RISCV-64 toolchain:

$ bazel build --platforms=//platforms:riscv_gcc --copt='-march=rv64gcv' @whisper_cpp//:main
...
$ file bazel-bin/external/+_repo_rules+whisper_cpp/main
bazel-bin/external/+_repo_rules+whisper_cpp/main: ELF 64-bit LSB executable, UCB RISC-V, RVC, double-float ABI, version 1 (GNU/Linux), dynamically linked, interpreter /lib/ld-linux-riscv64-lp64d.so.1, for GNU/Linux 4.15.0, not stripped

$ readelf -A bazel-bin/external/+_repo_rules+whisper_cpp/main
Attribute Section: riscv
File Attributes
  Tag_RISCV_stack_align: 16-bytes
  Tag_RISCV_arch: "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_v1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0"
  Tag_RISCV_priv_spec: 1
  Tag_RISCV_priv_spec_minor: 11

The final step is to locate the toolchain libraries used in this build, so that we can load them into Ghidra. They are usually cached in a per-user location. We’ll search for the RISCV libstdc++ toolchain library:

$ bazel info
...
output_base: /run/user/1000/bazel
output_path: /run/user/1000/bazel/execroot/_main/bazel-out
package_path: %workspace%
release: release 7.4.0
...
$ find /run/user/1000 -name libstdc++\*
...
/run/user/1000/bazel/external/gcc_riscv_suite+/riscv64-unknown-linux-gnu/lib/libstdc++.so.6.0.33
...
/run/user/1000/bazel/external/gcc_riscv_suite+/riscv64-unknown-linux-gnu/lib/libstdc++.so.6
/run/user/1000/bazel/external/gcc_riscv_suite+/riscv64-unknown-linux-gnu/lib/libstdc++.so

We will want to load libstdc++.so.6 into Ghidra before we load the reference app.