Skip to content

Commit

Permalink
Merge branch 'ggerganov:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
sealad886 authored May 30, 2024
2 parents 3755c5d + e6157f9 commit 2c619b0
Show file tree
Hide file tree
Showing 46 changed files with 897 additions and 747 deletions.
8 changes: 8 additions & 0 deletions .devops/main-intel.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04

FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg

ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
Expand Down
16 changes: 16 additions & 0 deletions .devops/server-intel.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04

FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg

ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
Expand All @@ -19,6 +27,14 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \

FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg

RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev

Expand Down
2 changes: 1 addition & 1 deletion .devops/tools.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ arg1="$1"
shift

if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert.py "$@"
python3 ./convert-hf-to-gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
Expand Down
38 changes: 0 additions & 38 deletions .github/ISSUE_TEMPLATE/06-question.yml

This file was deleted.

52 changes: 52 additions & 0 deletions .github/ISSUE_TEMPLATE/06-research.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
name: Research
description: Track new technical research area
title: "Research: "
labels: ["research 🔬"]
body:
- type: markdown
attributes:
value: |
Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
- type: checkboxes
id: research-stage
attributes:
label: Research Stage
description: Track general state of this research ticket
options:
- label: Background Research (Let's try to avoid reinventing the wheel)
- label: Hypothesis Formed (How do you think this will work and it's effect?)
- label: Strategy / Implementation Forming
- label: Analysis of results
- label: Debrief / Documentation (So people in the future can learn from us)

- type: textarea
id: background
attributes:
label: Previous existing literature and research
description: Whats the current state of the art and whats the motivation for this research?

- type: textarea
id: hypothesis
attributes:
label: Hypothesis
description: How do you think this will work and it's effect?

- type: textarea
id: implementation
attributes:
label: Implementation
description: Got an approach? e.g. a PR ready to go?

- type: textarea
id: analysis
attributes:
label: Analysis
description: How does the proposed implementation behave?

- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell
13 changes: 13 additions & 0 deletions .github/ISSUE_TEMPLATE/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
blank_issues_enabled: true
contact_links:
- name: Got an idea?
url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
about: Pop it there. It may then become an enhancement ticket.
- name: Got a question?
url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
about: Ask a question there!
- name: Want to contribute?
url: https://github.com/ggerganov/llama.cpp/wiki/contribute
about: Head to the contribution guide page of the wiki for areas you can help with


5 changes: 2 additions & 3 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,8 @@ jobs:
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
# TODO: Disabled due to build issues https://github.com/ggerganov/llama.cpp/issues/7507
#- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
#- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
steps:
- name: Check out the repo
uses: actions/checkout@v4
Expand Down
6 changes: 5 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -628,6 +628,10 @@ if (LLAMA_SYCL)
add_compile_definitions(GGML_SYCL_F16)
endif()

if (LLAMA_CUDA_FORCE_MMQ)
add_compile_definitions(GGML_SYCL_FORCE_MMQ)
endif()

add_compile_options(-I./) #include DPCT
add_compile_options(-I/${SYCL_INCLUDE_DIR})

Expand Down Expand Up @@ -1310,7 +1314,7 @@ set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}
install(TARGETS llama LIBRARY PUBLIC_HEADER)

install(
FILES convert.py
FILES convert-hf-to-gguf.py
PERMISSIONS
OWNER_READ
OWNER_WRITE
Expand Down
10 changes: 5 additions & 5 deletions README-sycl.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,10 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,

## OS

| OS | Status | Verified |
|---------|---------|------------------------------------|
| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39 |
| Windows | Support | Windows 11 |
| OS | Status | Verified |
|---------|---------|------------------------------------------------|
| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39, Arch Linux |
| Windows | Support | Windows 11 |


## Hardware
Expand All @@ -70,7 +70,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
|-------------------------------|---------|---------------------------------------|
| Intel Data Center Max Series | Support | Max 1550, 1100 |
| Intel Data Center Flex Series | Support | Flex 170 |
| Intel Arc Series | Support | Arc 770, 730M |
| Intel Arc Series | Support | Arc 770, 730M, Arc A750 |
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |

Expand Down
35 changes: 22 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -315,8 +315,6 @@ In order to build llama.cpp you have four different options.
make
```

**Note**: for `Debug` builds, run `make LLAMA_DEBUG=1`

- On Windows:

1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
Expand All @@ -328,23 +326,32 @@ In order to build llama.cpp you have four different options.
make
```

- Notes:
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, run `make LLAMA_DEBUG=1`

- Using `CMake`:

```bash
cmake -B build
cmake --build build --config Release
```
```bash
cmake -B build
cmake --build build --config Release
```

**Notes**:

**Note**: for `Debug` builds, there are two cases:
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, there are two cases:

- Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):

```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug
cmake --build build
```

- Multi-config generators (`-G` param set to Visual Studio, XCode...):
2. Multi-config generators (`-G` param set to Visual Studio, XCode...):

```bash
cmake -B build -G "Xcode"
Expand Down Expand Up @@ -477,7 +484,8 @@ Building the program with BLAS support may lead to some performance improvements
|--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. | |
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
Expand Down Expand Up @@ -696,7 +704,8 @@ Building the program with BLAS support may lead to some performance improvements
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derievatives.
It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.

```bash
# obtain the official LLaMA model weights and place them in ./models
Expand All @@ -713,10 +722,10 @@ ls ./models
python3 -m pip install -r requirements.txt
# convert the model to ggml FP16 format
python3 convert.py models/mymodel/
python3 convert-hf-to-gguf.py models/mymodel/
# [Optional] for models using BPE tokenizers
python convert.py models/mymodel/ --vocab-type bpe
python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe
# quantize the model to 4-bits (using Q4_K_M method)
./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
Expand Down
2 changes: 1 addition & 1 deletion ci/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ function gg_run_open_llama_7b_v2 {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

python3 ../convert.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
Expand Down
4 changes: 1 addition & 3 deletions convert-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf

from convert import LlamaHfVocab

logger = logging.getLogger("hf-to-gguf")


Expand Down Expand Up @@ -634,7 +632,7 @@ def _set_vocab_sentencepiece(self):
special_vocab.add_to_gguf(self.gguf_writer)

def _set_vocab_llama_hf(self):
vocab = LlamaHfVocab(self.dir_model)
vocab = gguf.LlamaHfVocab(self.dir_model)
tokens = []
scores = []
toktypes = []
Expand Down
2 changes: 1 addition & 1 deletion docs/HOWTO-add-model.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
### 1. Convert the model to GGUF

This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).
Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format).

The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.

Expand Down
Loading

0 comments on commit 2c619b0

Please sign in to comment.