diff --git a/README.md b/README.md index cdb508bdd..84521a7bc 100644 --- a/README.md +++ b/README.md @@ -154,6 +154,48 @@ The quantized model is sensitive to input types and CUDA handling. To avoid pote Additional tools for evaluating OLMo models are available at the [OLMo Eval](https://github.com/allenai/OLMo-eval) repo. +## Hosting + +An example script is provided for hosting an OLMo 2 model on Modal.com using the OpenAI API in ./scripts/olmo2_modal_openai.py. +To run that: +
    +
  1. Follow the instructions under Getting Started in [the Modal.com Guide](https://modal.com/docs/guide) to install +the Modal library and command line tools.
     +
  2. Follow the instructions under [Secrets](https://modal.com/docs/guide/secrets) in the Modal.com Guide to create a Modal secret named "example-secret-token" +that defines a value for the variable MODAL_TOKEN for your server.
     +
  3. Then run +```bash +modal deploy ./scripts/olmo2_modal_openai.py +``` +
     +
+ +You can check your endpoint using a curl command similar to the following: +```bash +curl -X POST \ + -H "Authorization: Bearer [the secret token from above]" \ + -H "Content-Type: application/json" \ + -d @body.json \ + https://[the web endpoint Modal creates above]/v1/chat/completions +``` + +where `body.json` is of the form: +```json +{ + "model": "OLMo-2-1124-13B-Instruct", + "messages": [ + { + "role": "user", + "content": "Who was Alan Turing?" + } + ], + "max_tokens": 100, + "temperature": 0.9, + "stream": true +} +``` + + ## Citing ```bibtex diff --git a/scripts/olmo2_modal_openai.py b/scripts/olmo2_modal_openai.py index c8687dc5c..009591c14 100644 --- a/scripts/olmo2_modal_openai.py +++ b/scripts/olmo2_modal_openai.py @@ -50,7 +50,7 @@ def download_model_to_image(model_dir, model_name, model_revision): # Our first order of business is to define the environment our server will run in # the container image. (See https://modal.com/docs/guide/custom-container) -# This differs from vllm_interface.py in two major ways: first, as of the time this +# This differs from vllm_inference.py in two major ways: first, as of the time this # is being written, the OLMo 2 model architecture requires building vLLM and transformers # from github commits that are too recent to have tagged versions, requiring that they # be built. 
The nvidia/cuda base image, git and build-essential apt_install, and the @@ -145,7 +145,7 @@ def serve(): # This example uses a token defined in the Modal secret linked above, # as described here: https://modal.com/docs/guide/secrets - async def is_authenticated(api_key: str = fastapi.Security(http_bearer)): + async def is_authenticated(api_key = fastapi.Security(http_bearer)): if api_key.credentials != os.getenv("MODAL_TOKEN"): raise fastapi.HTTPException( status_code=fastapi.status.HTTP_401_UNAUTHORIZED, @@ -207,7 +207,7 @@ async def is_authenticated(api_key: str = fastapi.Security(http_bearer)): # To deploy the API on Modal, just run # ```bash -# modal deploy vllm_inference.py +# modal deploy olmo2_modal_openai.py # ``` # This will create a new app on Modal, build the container image for it, and deploy. @@ -218,7 +218,7 @@ async def is_authenticated(api_key: str = fastapi.Security(http_bearer)): # something like `https://your-workspace-name--olmo-2-1124-instruct-openai-serve.modal.run`. # You can find [interactive Swagger UI docs](https://swagger.io/tools/swagger-ui/) -# at the `/docs` route of that URL, i.e. `https://your-workspace-name--example-vllm-openai-compatible-serve.modal.run/docs`. +# at the `/docs` route of that URL, i.e. `https://your-workspace-name--olmo-2-1124-instruct-openai-serve.modal.run/docs`. # These docs describe each route and indicate the expected input and output # and translate requests into `curl` commands. They also demonstrate authentication.