# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
the explicit/implicit prompt format on enc-dec LMMs for text generation.
"""

import os
import time
from collections.abc import Sequence
from dataclasses import asdict
from typing import NamedTuple

from vllm import LLM, EngineArgs, PromptType, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.utils import FlexibleArgumentParser


class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompts: Sequence[PromptType]


def run_whisper():
    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

    engine_args = EngineArgs(
        model="openai/whisper-large-v3-turbo",
        max_model_len=448,
        max_num_seqs=16,
        limit_mm_per_prompt={"audio": 1},
        dtype="half",
    )

    prompts = [
        {  # Test implicit prompt
            "prompt": "<|startoftranscript|>",
            "multi_modal_data": {
                "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
            },
        },
        {  # Test explicit encoder/decoder prompt
            "encoder_prompt": {
                "prompt": "",
                "multi_modal_data": {
                    "audio": AudioAsset("winning_call").audio_and_sample_rate,
                },
            },
            "decoder_prompt": "<|startoftranscript|>",
        },
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


model_example_map = {
    "whisper": run_whisper,
}


def parse_args():
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "encoder-decoder multimodal models for text generation"
    )
    parser.add_argument(
        "--model-type",
        "-m",
        type=str,
        default="whisper",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Set the seed when initializing `vllm.LLM`.",
    )
    return parser.parse_args()


def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    req_data = model_example_map[model]()

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {}
    )

    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
    llm = LLM(**engine_args)

    prompts = req_data.prompts

    # Create a sampling params object.
    sampling_params = SamplingParams(
        temperature=0,
        top_p=1.0,
        max_tokens=64,
        skip_special_tokens=False,
    )

    start = time.time()

    # Generate output tokens from the prompts. The output is a list of
    # RequestOutput objects that contain the prompt, generated
    # text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Decoder prompt: {prompt!r}, Generated text: {generated_text!r}")

    duration = time.time() - start

    print("Duration:", duration)
    print("RPS:", len(prompts) / duration)


if __name__ == "__main__":
    args = parse_args()
    main(args)
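
# Example invocation (a sketch; the script filename below is an assumption --
# substitute the actual path of this example in your checkout):
#
#   python encoder_decoder_multimodal.py --model-type whisper --seed 0
#
# Each printed line pairs the decoder prompt with the transcription generated
# from the corresponding audio asset.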