# Transcribe one audio/video pair: load the model, preprocess the inputs,
# generate token ids, then decode them to text.

# NOTE(review): trust_remote_code=True presumably allows the loader to run
# Python code shipped in the model repository -- confirm the repository is
# trusted before running this.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "enactic/avista-large-v2",
    trust_remote_code=True,
)

# NOTE(review): `processor` is not created in this snippet; it is assumed to
# be defined earlier -- confirm. The two paths are placeholders to replace.
inputs = processor(
    raw_audio="path/to/audio",
    raw_video="path/to/video",
    extract_mouth=True,
)

# Expand the processor output as keyword arguments to generate().
outputs = model.generate(**inputs, num_beams=5, max_new_tokens=256)

# Decode only the first returned sequence, omitting special tokens.
transcription = processor.decode(outputs[0], skip_special_tokens=True)