Llama 3.2 11B Vision Instruct
A vision-capable chat LLM from Meta
Deploy Llama 3.2 11B Vision Instruct behind an API endpoint in seconds.
Example usage
Llama 3.2 Vision Instruct uses the same messages format as other Llama models, with the addition of a new image field. While multi-turn conversations are supported, the model can only process one image per generation, which is supplied at the end of the messages array.
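For instance, a multi-turn request might shape its messages like the sketch below. The conversation content here is purely illustrative; the complete, runnable example follows.

# Hypothetical multi-turn conversation. The single {"type": "image"}
# placeholder sits in the final user message; the image itself is passed
# in a separate top-level "image" field of the request body (see below).
messages = [
    {"role": "user", "content": [
        {"type": "text", "text": "Can you describe images for me?"}
    ]},
    {"role": "assistant", "content": "Yes! Send an image along with your question."},
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "What animal is in this picture?"}
    ]},
]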
import os
import requests

# Replace the empty string with your model id below
model_id = ""
baseten_api_key = os.environ["BASETEN_API_KEY"]

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Can you write a haiku about this image?"}
    ]},
]
image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"

# The image is passed as a URL in a separate top-level field of the payload
data = {
    "messages": messages,
    "image": image,
    "stream": True,
    "max_new_tokens": 512,
    "temperature": 0.9
}

# Call the model endpoint
res = requests.post(
    f"https://model-{model_id}.api.baseten.co/production/predict",
    headers={"Authorization": f"Api-Key {baseten_api_key}"},
    json=data,
    stream=True
)

# Print the generated tokens as they are streamed back
for content in res.iter_content():
    print(content.decode("utf-8"), end="", flush=True)
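When streaming is disabled (by setting stream to False in the payload), the endpoint returns a complete chat completion object like the JSON example below.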
{
  "id": "chat-b1e89c98a7294d9dbb9d5e7867d2cb7c",
  "object": "chat.completion",
  "created": 1727839150,
  "model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "This image is a close-up photograph of a black Labrador puppy with floppy ears and a shiny, healthy coat, gazing up at the camera with large brown eyes.",
        "tool_calls": []
      },
      "logprobs": null,
      "finish_reason": "stop",
      "stop_reason": null
    }
  ],
  "usage": {
    "prompt_tokens": 18,
    "total_tokens": 52,
    "completion_tokens": 34
  },
  "prompt_logprobs": null
}
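To pull the generated text out of a response shaped like the one above, index into the choices array. A minimal sketch, assuming the request was sent with "stream": False so the full JSON object comes back in one piece:

# Minimal sketch: parse a non-streaming response. Assumes the request
# above was re-sent with "stream": False in the payload (and without
# stream=True in requests.post), so res.json() holds the complete object.
response = res.json()
generated_text = response["choices"][0]["message"]["content"]
print(generated_text)

# Token accounting from the usage block
print(response["usage"]["completion_tokens"], "completion tokens")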