llama.cpp/Makefile

# Default goal: builds every command-line tool. It is defined before anything
# else so that it is always the first target in the file.
# `default` is only a name for the group, not a file that gets created, so it
# is declared phony: make then skips implicit-rule search for it and is not
# influenced by a stray file or source called `default`.
default: main quantize quantize-stats perplexity embedding vdot
.PHONY: default

# Host identification. Each value is probed with `uname` only when the caller
# has not already supplied a non-empty one, so the detection can be overridden
# from the environment or the command line.

# Machine hardware name (compared against x86_64, aarch64, ppc64, ... below).
ifndef UNAME_M
  UNAME_M := $(shell uname -m)
endif

# Processor type (used by the Darwin check below).
ifndef UNAME_P
  UNAME_P := $(shell uname -p)
endif

# Operating system name (compared against Linux, Darwin, ... below).
ifndef UNAME_S
  UNAME_S := $(shell uname -s)
endif

# First line of each compiler's `--version` output, captured once at parse
# time for the build-information printout.
CCV := $(shell $(CC) --version | head -n 1)
CXXV := $(shell $(CXX) --version | head -n 1)

# On macOS, `uname` can report an x86_64 host even though the hardware is ARM.
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
ifeq ($(UNAME_S),Darwin)
  ifneq ($(UNAME_P),arm)
    # Ask the kernel directly; stderr is discarded, so SYSCTL_M is simply
    # empty when the query fails.
    SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
    ifeq ($(SYSCTL_M),1)
      # Only a warning is issued; the overrides below are left disabled:
      #   UNAME_P := arm
      #   UNAME_M := arm64
      warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
    endif
  endif
endif

#
# Compile flags
#

# keep standard at C11 and C++11
# Baseline flags: the current directory on the include path (C++ additionally
# gets ./examples), -O3, -DNDEBUG and -fPIC. LDFLAGS starts out empty and is
# extended by the optional backends further down. These are plain `=`
# assignments, so any value inherited from the environment is replaced.
CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS  =

# warnings
# Both languages share -Wall -Wextra -Wpedantic -Wcast-qual. The C build adds
# four further checks; the C++ build instead silences two warnings
# (-Wno-unused-function, -Wno-multichar).
CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar

# OS specific
# TODO: support Windows
# Every supported Unix-like host gets -pthread for both compilers. A single
# membership test on the `uname -s` value replaces one copy-pasted conditional
# per operating system; add new systems to the list below.
ifneq ($(filter Linux Darwin FreeBSD NetBSD OpenBSD Haiku,$(UNAME_S)),)
  CFLAGS   += -pthread
  CXXFLAGS += -pthread
endif

# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
#       feel free to update the Makefile for your architecture and send a pull request or issue

# x86 hosts. The names are matched word-for-word against UNAME_M; `amd64` is
# how the BSDs name 64-bit x86. The test checks for a non-empty filter result
# (instead of comparing UNAME_M with the filter output) so that an empty
# UNAME_M does not count as a match.
ifneq ($(filter x86_64 i686 amd64,$(UNAME_M)),)
  # Use all CPU extensions that are available:
  CFLAGS   += -march=native -mtune=native
  CXXFLAGS += -march=native -mtune=native

  # Usage AVX-only
  #CFLAGS   += -mfma -mf16c -mavx
  #CXXFLAGS += -mfma -mf16c -mavx
endif
# POWER hosts: any machine name that starts with `ppc64`.
ifneq ($(filter ppc64%,$(UNAME_M)),)
  # Tune for POWER9 when /proc/cpuinfo mentions it.
  POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
  ifneq ($(findstring POWER9,$(POWER9_M)),)
    CFLAGS   += -mcpu=power9
    CXXFLAGS += -mcpu=power9
  endif
  # A machine name of exactly `ppc64` selects the big-endian code path, which
  # requires C++23's std::byteswap.
  ifeq ($(UNAME_M),ppc64)
    CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
  endif
endif
# Apple Accelerate framework: enabled on every Darwin host unless
# LLAMA_NO_ACCELERATE is set. Aimed at Mac M1; `-framework Accelerate` works
# on Mac Intel as well, with a negligible performance boost.
ifeq ($(UNAME_S),Darwin)
  ifndef LLAMA_NO_ACCELERATE
    CFLAGS  += -DGGML_USE_ACCELERATE
    LDFLAGS += -framework Accelerate
  endif
endif
# OpenBLAS backend, enabled by setting LLAMA_OPENBLAS. The headers are looked
# up under /usr/local as well as /usr, where many Linux distributions package
# them; an include directory that does not exist is ignored by the compiler.
ifdef LLAMA_OPENBLAS
  CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
  LDFLAGS += -lopenblas
endif
# NVIDIA cuBLAS backend, enabled by setting LLAMA_CUBLAS.
#
# The rule is placed first and the assignments after it on purpose. While a
# rule is open, GNU make treats every TAB-indented line as part of its recipe
# until the next rule or variable definition. With the rule as the last item
# of this conditional, TAB-indented assignments in later conditionals would be
# appended to the ggml-cuda.o recipe. The space-indented assignments below
# close the rule. NVCC, NVCCFLAGS and CXXFLAGS are expanded only when the
# recipe runs, so defining them after the rule is fine.
ifdef LLAMA_CUBLAS
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -c $< -o $@

  CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
  LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
  # Extra object linked into every binary and into libllama.so.
  OBJS      += ggml-cuda.o
  NVCC      = nvcc
  NVCCFLAGS = --forward-unknown-to-host-linker -arch=native
endif
# Optional instrumentation.
# NOTE: the bodies are indented with spaces, not TABs. These conditionals
# follow the LLAMA_CUBLAS section, which contains a rule; a TAB-indented line
# that appears while a rule is still open is taken as a recipe line instead of
# a variable assignment.
ifdef LLAMA_GPROF
  # Compile with -pg (profiling instrumentation).
  CFLAGS   += -pg
  CXXFLAGS += -pg
endif
ifdef LLAMA_PERF
  # Define GGML_PERF for both the C and the C++ sources.
  CFLAGS   += -DGGML_PERF
  CXXFLAGS += -DGGML_PERF
endif
# ARM hosts, selected by the prefix of UNAME_M.
# NOTE: the bodies are indented with spaces, not TABs. A TAB-indented line
# that appears while an earlier rule (e.g. the one in the LLAMA_CUBLAS
# section) is still open would be parsed as a recipe line, not an assignment.
ifneq ($(filter aarch64%,$(UNAME_M)),)
  CFLAGS   += -mcpu=native
  CXXFLAGS += -mcpu=native
endif
ifneq ($(filter armv6%,$(UNAME_M)),)
  # Raspberry Pi 1, 2, 3
  CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
endif
ifneq ($(filter armv7%,$(UNAME_M)),)
  # Raspberry Pi 4
  CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif
ifneq ($(filter armv8%,$(UNAME_M)),)
  # Raspberry Pi 4
  CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif

#
# Print build information
#

# Printed while the makefile is being read, i.e. on every invocation and
# before any recipe runs. The flag variables are shown as they stand at this
# point in the file; CCV and CXXV hold the first line of each compiler's
# `--version` output. The final empty $(info ) emits a blank separator line.
$(info I llama.cpp build info: )
$(info I UNAME_S:  $(UNAME_S))
$(info I UNAME_P:  $(UNAME_P))
$(info I UNAME_M:  $(UNAME_M))
$(info I CFLAGS:   $(CFLAGS))
$(info I CXXFLAGS: $(CXXFLAGS))
$(info I LDFLAGS:  $(LDFLAGS))
$(info I CC:       $(CCV))
$(info I CXX:      $(CXXV))
$(info )

#
# Build library
#

# C object, built with $(CC) and CFLAGS. Only the first prerequisite ($<, the
# .c file) is compiled; the header is listed so that a change to it triggers
# a rebuild.
ggml.o: ggml.c ggml.h
	$(CC)  $(CFLAGS)   -c $< -o $@

# C++ object, built with $(CXX) and CXXFLAGS; rebuilt when llama.cpp or any of
# the listed headers changes.
llama.o: llama.cpp ggml.h llama.h llama_util.h
	$(CXX) $(CXXFLAGS) -c $< -o $@

# Helper object from examples/, linked into main, perplexity and embedding.
common.o: examples/common.cpp examples/common.h
	$(CXX) $(CXXFLAGS) -c $< -o $@

# Remove everything this Makefile can produce: object files, the tools built
# by the default goal (including vdot), the shared library and the benchmark
# binary. Declared phony so that a file named `clean` cannot disable it.
.PHONY: clean
clean:
	rm -vf *.o main quantize quantize-stats perplexity embedding vdot libllama.so benchmark-q4_0-matmult

# The `main` example program. The source file and all objects are compiled and
# linked in one step ($^ expands to every prerequisite); a usage hint framed
# by blank lines is printed afterwards.
main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
	@printf '\n%s\n\n' '====  Run ./main -h for help.  ===='

# The remaining tools all use the same one-step compile-and-link recipe:
# $^ is the full prerequisite list (source first, then objects), $@ the tool
# name. $(OBJS) carries any extra backend objects (it is only extended in the
# LLAMA_CUBLAS section) and LDFLAGS goes last on the command line.

# Links ggml and llama, but not common.o.
quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

# Links ggml and llama, but not common.o.
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

# Links ggml, llama and the shared example helpers (common.o).
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

# Links ggml, llama and the shared example helpers (common.o).
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

# Lives under pocs/ and links against ggml only.
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

# Shared library made from the llama and ggml objects plus any backend objects
# in $(OBJS). It is not part of the default goal and must be requested
# explicitly (`make libllama.so`). -fPIC is repeated here although the base
# CXXFLAGS already contain it.
libllama.so: llama.o ggml.o $(OBJS)
	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

#
# Tests
#

# Build the q4_0 matrix-multiplication benchmark and run it right away.
# No file called `benchmark` is ever created (the binary is named
# benchmark-q4_0-matmult), so the target is declared phony: it runs on every
# invocation and cannot be shadowed by a stray file of that name.
# Note that the .c source is built with $(CXX) and CXXFLAGS.
.PHONY: benchmark
benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o $(OBJS)
	$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
	./benchmark-q4_0-matmult

# Run the test script with bash. The target is phony, so the script runs on
# every invocation.
tests:
	bash ./tests/run-tests.sh
.PHONY: tests
Improve cuBLAS performance by dequantizing on the GPU (#1065) 1 year ago			`# Define the default target now so that it is always the first target`
			`default: main quantize quantize-stats perplexity embedding vdot`

Initial release 1 year ago			`ifndef UNAME_S`
			`UNAME_S := $(shell uname -s)`
			`endif`

			`ifndef UNAME_P`
			`UNAME_P := $(shell uname -p)`
			`endif`

			`ifndef UNAME_M`
			`UNAME_M := $(shell uname -m)`
			`endif`

			`CCV := $(shell $(CC) --version \| head -n 1)`
			`CXXV := $(shell $(CXX) --version \| head -n 1)`

			`# Mac OS + Arm can report x86_64`
			`# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789`
			`ifeq ($(UNAME_S),Darwin)`
			`ifneq ($(UNAME_P),arm)`
Makefile: slightly cleanup for Mac Intel; echo instead of run ./main -h (#335) 1 year ago			`SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)`
Initial release 1 year ago			`ifeq ($(SYSCTL_M),1)`
			`# UNAME_P := arm`
			`# UNAME_M := arm64`
			`warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)`
			`endif`
			`endif`
			`endif`

			`#`
			`# Compile flags`
			`#`

Add tokenizer test + revert to C++11 (#355) * Add test-tokenizer-0 to do a few tokenizations - feel free to expand * Added option to convert-pth-to-ggml.py script to dump just the vocabulary * Added ./models/ggml-vocab.bin containing just LLaMA vocab data (used for tests) * Added utility to load vocabulary file from previous point (temporary implementation) * Avoid using std::string_view and drop back to C++11 (hope I didn't break something) * Rename gpt_vocab -> llama_vocab * All CMake binaries go into ./bin/ now 1 year ago			`# keep standard at C11 and C++11`
Initial release 1 year ago			`CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC`
Add tokenizer test + revert to C++11 (#355) * Add test-tokenizer-0 to do a few tokenizations - feel free to expand * Added option to convert-pth-to-ggml.py script to dump just the vocabulary * Added ./models/ggml-vocab.bin containing just LLaMA vocab data (used for tests) * Added utility to load vocabulary file from previous point (temporary implementation) * Avoid using std::string_view and drop back to C++11 (hope I didn't break something) * Rename gpt_vocab -> llama_vocab * All CMake binaries go into ./bin/ now 1 year ago			`CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC`
Initial release 1 year ago			`LDFLAGS =`

all : be more strict about converting float to double (#458) * Be more strict about converting float to double * Test equivalence of round, SILU implementations Test module is commented out in CMakeLists.txt because the tests may take a long time, depending on how much the compiler optimizes. * Fix softmax in perplexity.cpp * all : prefer float over double where appropriate * perplexity : add <cmath> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 1 year ago			`# warnings`
ggml : Q4 cleanup - remove 4-bit dot product code (#1061) * Q4 cleanup * Remove unused AVX512 Q4_0 code 1 year ago			`CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith`
Rewrite loading code to try to satisfy everyone: - Support all three formats (ggml, ggmf, ggjt). (However, I didn't include the hack needed to support GPT4All files without conversion. Those can still be used after converting them with convert.py from my other PR.) - Support both mmap and read (mmap is used by default, but can be disabled with `--no-mmap`, and is automatically disabled for pre-ggjt files or on platforms where mmap is not supported). - Support multi-file models like before, but automatically determine the number of parts rather than requiring `--n_parts`. - Improve validation and error checking. - Stop using the per-file type field (f16) entirely in favor of just relying on the per-tensor type/size fields. This has no immediate benefit, but makes it easier to experiment with different formats, and should make it easier to support the new GPTQ-for-LLaMa models in the future (I have some work in progress on that front). - Support VirtualLock on Windows (using the same `--mlock` option as on Unix). - Indicate loading progress when using mmap + mlock. (Which led me to the interesting observation that on my Linux machine, with a warm file cache, mlock actually takes some time, whereas mmap without mlock starts almost instantly...) - To help implement this, move mlock support from ggml to the loading code. - madvise/PrefetchVirtualMemory support (based on #740) - Switch from ifstream to the `fopen` family of functions to avoid unnecessary copying and, when mmap is enabled, allow reusing the same file descriptor for both metadata reads and mmap (whereas the existing implementation opens the file a second time to mmap). - Quantization now produces a single-file output even with multi-file inputs (not really a feature as much as 'it was easier this way'). Implementation notes: I tried to factor the code into more discrete pieces than before. 
Regarding code style: I tried to follow the code style, but I'm naughty and used a few advanced C++ features repeatedly: - Destructors to make it easier to ensure everything gets cleaned up. - Exceptions. I don't even usually use exceptions when writing C++, and I can remove them if desired... but here they make the loading code much more succinct while still properly handling a variety of errors, ranging from API calls failing to integer overflow and allocation failure. The exceptions are converted to error codes at the API boundary.) Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from #740) 1 year ago			`CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar`
all : be more strict about converting float to double (#458) * Be more strict about converting float to double * Test equivalence of round, SILU implementations Test module is commented out in CMakeLists.txt because the tests may take a long time, depending on how much the compiler optimizes. * Fix softmax in perplexity.cpp * all : prefer float over double where appropriate * perplexity : add <cmath> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 1 year ago
Initial release 1 year ago			`# OS specific`
			`# TODO: support Windows`
			`ifeq ($(UNAME_S),Linux)`
			`CFLAGS += -pthread`
			`CXXFLAGS += -pthread`
			`endif`
			`ifeq ($(UNAME_S),Darwin)`
			`CFLAGS += -pthread`
			`CXXFLAGS += -pthread`
			`endif`
			`ifeq ($(UNAME_S),FreeBSD)`
			`CFLAGS += -pthread`
			`CXXFLAGS += -pthread`
			`endif`
Add NetBSD support. (#90) 1 year ago			`ifeq ($(UNAME_S),NetBSD)`
			`CFLAGS += -pthread`
			`CXXFLAGS += -pthread`
			`endif`
Add OpenBSD support (#314) 1 year ago			`ifeq ($(UNAME_S),OpenBSD)`
			`CFLAGS += -pthread`
			`CXXFLAGS += -pthread`
			`endif`
Initial release 1 year ago			`ifeq ($(UNAME_S),Haiku)`
			`CFLAGS += -pthread`
			`CXXFLAGS += -pthread`
			`endif`

			`# Architecture specific`
			`# TODO: probably these flags need to be tweaked on some architectures`
			`# feel free to update the Makefile for your architecture and send a pull request or issue`
			`ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))`
make : use -march=native -mtune=native on x86 (#609) 1 year ago			`# Use all CPU extensions that are available:`
ggml : fix AVX build + update to new Q8_0 format 1 year ago			`CFLAGS += -march=native -mtune=native`
make : missing host optimizations in CXXFLAGS (#763) 1 year ago			`CXXFLAGS += -march=native -mtune=native`
ggml : fix AVX build + update to new Q8_0 format 1 year ago
			`# Usage AVX-only`
			`#CFLAGS += -mfma -mf16c -mavx`
			`#CXXFLAGS += -mfma -mf16c -mavx`
Initial release 1 year ago			`endif`
			`ifneq ($(filter ppc64%,$(UNAME_M)),)`
			`POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)`
			`ifneq (,$(findstring POWER9,$(POWER9_M)))`
ggml : fix AVX build + update to new Q8_0 format 1 year ago			`CFLAGS += -mcpu=power9`
additional optimizations for POWER9 (#454) 1 year ago			`CXXFLAGS += -mcpu=power9`
Initial release 1 year ago			`endif`
			`# Require c++23's std::byteswap for big-endian support.`
			`ifeq ($(UNAME_M),ppc64)`
			`CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN`
			`endif`
			`endif`
Update Makefile var + add comment 1 year ago			`ifndef LLAMA_NO_ACCELERATE`
Makefile: slightly cleanup for Mac Intel; echo instead of run ./main -h (#335) 1 year ago			`# Mac M1 - include Accelerate framework.`
			# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
Initial release 1 year ago			`ifeq ($(UNAME_S),Darwin)`
			`CFLAGS += -DGGML_USE_ACCELERATE`
			`LDFLAGS += -framework Accelerate`
			`endif`
			`endif`
Update Makefile var + add comment 1 year ago			`ifdef LLAMA_OPENBLAS`
Initial release 1 year ago			`CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas`
			`LDFLAGS += -lopenblas`
			`endif`
Add NVIDIA cuBLAS support (#1044) 1 year ago			`ifdef LLAMA_CUBLAS`
Improve cuBLAS performance by using a memory pool (#1094) * Improve cuBLAS performance by using a memory pool * Move cuda specific definitions to ggml-cuda.h/cu * Add CXX flags to nvcc * Change memory pool synchronization mechanism to a spin lock General code cleanup 1 year ago			`CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include`
			`LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64`
			`OBJS += ggml-cuda.o`
			`NVCC = nvcc`
			`NVCCFLAGS = --forward-unknown-to-host-linker -arch=native`
Improve cuBLAS performance by dequantizing on the GPU (#1065) 1 year ago			`ggml-cuda.o: ggml-cuda.cu ggml-cuda.h`
Improve cuBLAS performance by using a memory pool (#1094) * Improve cuBLAS performance by using a memory pool * Move cuda specific definitions to ggml-cuda.h/cu * Add CXX flags to nvcc * Change memory pool synchronization mechanism to a spin lock General code cleanup 1 year ago			`$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -c $< -o $@`
Add NVIDIA cuBLAS support (#1044) 1 year ago			`endif`
Update Makefile var + add comment 1 year ago			`ifdef LLAMA_GPROF`
Initial release 1 year ago			`CFLAGS += -pg`
			`CXXFLAGS += -pg`
			`endif`
ggml : better PERF prints + support "LLAMA_PERF=1 make" 1 year ago			`ifdef LLAMA_PERF`
			`CFLAGS += -DGGML_PERF`
			`CXXFLAGS += -DGGML_PERF`
			`endif`
Initial release 1 year ago			`ifneq ($(filter aarch64%,$(UNAME_M)),)`
ggml : fix AVX build + update to new Q8_0 format 1 year ago			`CFLAGS += -mcpu=native`
Initial release 1 year ago			`CXXFLAGS += -mcpu=native`
			`endif`
			`ifneq ($(filter armv6%,$(UNAME_M)),)`
			`# Raspberry Pi 1, 2, 3`
			`CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access`
			`endif`
			`ifneq ($(filter armv7%,$(UNAME_M)),)`
			`# Raspberry Pi 4`
			`CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations`
			`endif`
			`ifneq ($(filter armv8%,$(UNAME_M)),)`
			`# Raspberry Pi 4`
			`CFLAGS += -mfp16-format=ieee -mno-unaligned-access`
			`endif`

			`#`
			`# Print build information`
			`#`

			`$(info I llama.cpp build info: )`
			`$(info I UNAME_S: $(UNAME_S))`
			`$(info I UNAME_P: $(UNAME_P))`
			`$(info I UNAME_M: $(UNAME_M))`
			`$(info I CFLAGS: $(CFLAGS))`
			`$(info I CXXFLAGS: $(CXXFLAGS))`
			`$(info I LDFLAGS: $(LDFLAGS))`
			`$(info I CC: $(CCV))`
			`$(info I CXX: $(CXXV))`
			`$(info )`

			`#`
			`# Build library`
			`#`

			`ggml.o: ggml.c ggml.h`
make : fix dependencies, use auto variables (#983) 1 year ago			`$(CC) $(CFLAGS) -c $< -o $@`
Initial release 1 year ago
make : fix dependencies, use auto variables (#983) 1 year ago			`llama.o: llama.cpp ggml.h llama.h llama_util.h`
			`$(CXX) $(CXXFLAGS) -c $< -o $@`
Introduce C-style API (#370) * Major refactoring - introduce C-style API * Clean up * Add <cassert> * Add <iterator> * Add <algorithm> .... * Fix timing reporting and accumulation * Measure eval time only for single-token calls * Change llama_tokenize return meaning 1 year ago
Overhaul the examples structure - main -> examples - utils -> examples (renamed to "common") - quantize -> examples - separate tools for "perplexity" and "embedding" Hope I didn't break something ! 1 year ago			`common.o: examples/common.cpp examples/common.h`
make : fix dependencies, use auto variables (#983) 1 year ago			`$(CXX) $(CXXFLAGS) -c $< -o $@`
Initial release 1 year ago
			`clean:`
benchmark : add tool for timing q4_0 matrix multiplication (#653) * Initial version of q4_0 matrix multiplication benchmark * Bugfix: Added dependency to ggml.o to benchmark * Reviewer requests: added parameter for threads, switched to ggml_time_us() * Reviewer input: removed rtsc, use epsilon for check * Review comment: Removed set_locale * Feature: Param for numer of iterations, Bugfix for use of parameter threads * Reviewer suggestion: Moved to examples * Reviewer feedback: Updated clean: and benchmark: sections --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 1 year ago			`rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult`
Initial release 1 year ago
Improve cuBLAS performance by dequantizing on the GPU (#1065) 1 year ago			`main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS)`
make : fix dependencies, use auto variables (#983) 1 year ago			`$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)`
Fix Makefile echo escape codes (by removing them). (#418) 1 year ago			`@echo`
			`@echo '==== Run ./main -h for help. ===='`
			`@echo`
Initial release 1 year ago
Improve cuBLAS performance by dequantizing on the GPU (#1065) 1 year ago			`quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS)`
make : fix dependencies, use auto variables (#983) 1 year ago			`$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)`
Overhaul the examples structure - main -> examples - utils -> examples (renamed to "common") - quantize -> examples - separate tools for "perplexity" and "embedding" Hope I didn't break something ! 1 year ago
Improve cuBLAS performance by dequantizing on the GPU (#1065) 1 year ago			`quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS)`
make : fix dependencies, use auto variables (#983) 1 year ago			`$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)`
Add quantize-stats command for testing quantization (#728) Command that calculates some statistics over the errors introduced by quantization, like mean square error, max error and some percentile errors for layer weights. Should be useful for testing quantization improvements. Exposes some internal state from ggml and llama for testing 1 year ago
Improve cuBLAS performance by dequantizing on the GPU (#1065) 1 year ago			`perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS)`
make : fix dependencies, use auto variables (#983) 1 year ago			`$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)`
Initial release 1 year ago
Improve cuBLAS performance by dequantizing on the GPU (#1065) 1 year ago			`embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS)`
make : fix dependencies, use auto variables (#983) 1 year ago			`$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)`
Add embedding example to Makefile (#540) 1 year ago
Improve cuBLAS performance by dequantizing on the GPU (#1065) 1 year ago			`vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)`
Adding a simple program to measure speed of dot products (#1041) On my Mac, the direct Q4_1 product is marginally slower (~69 vs ~55 us for Q4_0). The SIMD-ified ggml version is now almost 2X slower (~121 us). On a Ryzen 7950X CPU, the direct product for Q4_1 quantization is faster than the AVX2 implementation (~60 vs ~62 us). --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com> 1 year ago			`$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)`

Improve cuBLAS performance by dequantizing on the GPU (#1065) 1 year ago			`libllama.so: llama.o ggml.o $(OBJS)`
make : fix dependencies, use auto variables (#983) 1 year ago			`$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)`
fix whitespace (#944) 1 year ago
Initial release 1 year ago			`#`
			`# Tests`
			`#`

Improve cuBLAS performance by dequantizing on the GPU (#1065) 1 year ago			`benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o $(OBJS)`
make : fix dependencies, use auto variables (#983) 1 year ago			`$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)`
benchmark : add tool for timing q4_0 matrix multiplication (#653) * Initial version of q4_0 matrix multiplication benchmark * Bugfix: Added dependency to ggml.o to benchmark * Reviewer requests: added parameter for threads, switched to ggml_time_us() * Reviewer input: removed rtsc, use epsilon for check * Review comment: Removed set_locale * Feature: Param for numer of iterations, Bugfix for use of parameter threads * Reviewer suggestion: Moved to examples * Reviewer feedback: Updated clean: and benchmark: sections --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 1 year ago			`./benchmark-q4_0-matmult`
fix whitespace (#944) 1 year ago
Initial release 1 year ago			`.PHONY: tests`
			`tests:`
			`bash ./tests/run-tests.sh`