diff --git a/README.md b/README.md
index cdb508bdd..84521a7bc 100644
--- a/README.md
+++ b/README.md
@@ -154,6 +154,48 @@ The quantized model is sensitive to input types and CUDA handling. To avoid pote
Additional tools for evaluating OLMo models are available at the [OLMo Eval](https://github.com/allenai/OLMo-eval) repo.
+## Hosting
+
+An example script for hosting an OLMo 2 model on Modal.com behind an OpenAI-compatible API is provided in `./scripts/olmo2_modal_openai.py`.
+To run it:
+
+- Follow the instructions under Getting Started in [the Modal.com Guide](https://modal.com/docs/guide) to install
+the Modal library and command line tools.
+- Follow the instructions under [Secrets](https://modal.com/docs/guide/secrets) in the Modal.com Guide to create a Modal secret named `example-secret-token`
+that defines a value for the `MODAL_TOKEN` environment variable used by your server (see the sketch after this list).
+- Then run:
+```bash
+modal deploy ./scripts/olmo2_modal_openai.py
+```
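+
+As referenced in the secrets step above, the secret can also be created from the command line. This is a sketch, assuming the Modal CLI's `modal secret create` subcommand; the token value is a placeholder you should replace with a value of your own choosing:
+```bash
+modal secret create example-secret-token MODAL_TOKEN=[your chosen secret token]
+```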
+
+You can check your endpoint with a `curl` command similar to the following:
+```bash
+curl -X POST \
+ -H "Authorization: Bearer [the secret token from above]" \
+ -H "Content-Type: application/json" \
+ -d @body.json \
+ https://[the web endpoint modal creates above]/v1/chat/completions
+```
+
+where `body.json` is of the form:
+```json
+{
+ "model": "OLMo-2-1124-13B-Instruct",
+ "messages": [
+ {
+ "role": "user",
+ "content": "Who was Alan Turing?"
+ }
+ ],
+ "max_tokens": 100,
+ "temperature": 0.9,
+ "stream": true
+}
+```
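+
+If you prefer Python, the endpoint also works with the official `openai` client, since the server exposes an OpenAI-compatible chat-completions route. This is a minimal sketch; the base URL and API key are the same placeholder values as in the `curl` example:
+```python
+from openai import OpenAI
+
+# Point the client at the deployed Modal endpoint instead of api.openai.com.
+client = OpenAI(
+    base_url="https://[the web endpoint modal creates above]/v1",
+    api_key="[the secret token from above]",
+)
+
+stream = client.chat.completions.create(
+    model="OLMo-2-1124-13B-Instruct",
+    messages=[{"role": "user", "content": "Who was Alan Turing?"}],
+    max_tokens=100,
+    temperature=0.9,
+    stream=True,
+)
+for chunk in stream:
+    # Each streamed chunk carries an incremental piece of the reply.
+    if chunk.choices and chunk.choices[0].delta.content:
+        print(chunk.choices[0].delta.content, end="")
+```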
+
## Citing
```bibtex
diff --git a/scripts/olmo2_modal_openai.py b/scripts/olmo2_modal_openai.py
index c8687dc5c..009591c14 100644
--- a/scripts/olmo2_modal_openai.py
+++ b/scripts/olmo2_modal_openai.py
@@ -50,7 +50,7 @@ def download_model_to_image(model_dir, model_name, model_revision):
# Our first order of business is to define the environment our server will run in:
# the container image. (See https://modal.com/docs/guide/custom-container)
-# This differs from vllm_interface.py in two major ways: first, as of the time this
+# This differs from vllm_inference.py in two major ways: first, as of the time this
# is being written, the OLMo 2 model architecture requires vLLM and transformers
# at github commits that are too recent to have tagged versions, so they must be
# built from source. The nvidia/cuda base image, git and build-essential apt_install, and the
@@ -145,7 +145,7 @@ def serve():
# This example uses a token defined in the Modal secret linked above,
# as described here: https://modal.com/docs/guide/secrets
- async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
+ async def is_authenticated(api_key=fastapi.Security(http_bearer)):
if api_key.credentials != os.getenv("MODAL_TOKEN"):
raise fastapi.HTTPException(
status_code=fastapi.status.HTTP_401_UNAUTHORIZED,
@@ -207,7 +207,7 @@ async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
# To deploy the API on Modal, just run
# ```bash
-# modal deploy vllm_inference.py
+# modal deploy olmo2_modal_openai.py
# ```
# This will create a new app on Modal, build the container image for it, and deploy.
@@ -218,7 +218,7 @@ async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
# something like `https://your-workspace-name--olmo-2-1124-instruct-openai-serve.modal.run`.
# You can find [interactive Swagger UI docs](https://swagger.io/tools/swagger-ui/)
-# at the `/docs` route of that URL, i.e. `https://your-workspace-name--example-vllm-openai-compatible-serve.modal.run/docs`.
+# at the `/docs` route of that URL, i.e. `https://your-workspace-name--olmo-2-1124-instruct-openai-serve.modal.run/docs`.
# These docs describe each route and indicate the expected input and output
# and translate requests into `curl` commands. They also demonstrate authentication.