diff --git a/tests/model_hub_tests/torch_tests/hf_transformers_models b/tests/model_hub_tests/torch_tests/hf_transformers_models
index 73e884d49ea..5c09c1d3b54 100644
--- a/tests/model_hub_tests/torch_tests/hf_transformers_models
+++ b/tests/model_hub_tests/torch_tests/hf_transformers_models
@@ -65,7 +65,7 @@ EleutherAI/pythia-6.9b,gpt_neox
 facebook/bart-large-mnli,bart
 facebook/convnextv2-tiny-22k-384,convnextv2
 facebook/detr-resnet-50,detr
-facebook/dinov2-base,dinov2,skip,Load problem
+facebook/dinov2-base,dinov2,xfail,Tracing error: Please check correctness of provided example_input (but eval was correct)
 facebook/dpr-question_encoder-single-nq-base,dpr
 facebook/encodec_24khz,encodec,xfail,Unsupported op aten::lstm
 facebook/esm2_t6_8M_UR50D,esm
@@ -168,7 +168,7 @@ HJHGJGHHG/GAU-Base-Full,gau,skip,Load problem
 huggingface/autoformer-tourism-monthly,autoformer,skip,Load problem
 huggingface/informer-tourism-monthly,informer,skip,Load problem
 huggingface/time-series-transformer-tourism-monthly,time_series_transformer,skip,Load problem
-HuggingFaceM4/tiny-random-idefics,idefics,skip,Load problem
+HuggingFaceM4/tiny-random-idefics,idefics,xfail,Tracing error: Please check correctness of provided example_input (eval was correct but trace failed with incompatible tuples and tensors)
 HuggingFaceM4/tiny-random-vllama-clip,vllama,skip,Load problem
 HuggingFaceM4/tiny-random-vopt-clip,vopt,skip,Load problem
 HuiHuang/gpt3-damo-base-zh,gpt3,skip,Load problem
@@ -243,12 +243,12 @@ microsoft/conditional-detr-resnet-50,conditional_detr
 microsoft/deberta-base,deberta
 microsoft/git-large-coco,git,skip,Load problem
 microsoft/layoutlm-base-uncased,layoutlm
-microsoft/layoutlmv2-base-uncased,layoutlmv2,skip,Load problem
+microsoft/layoutlmv2-base-uncased,layoutlmv2,xfail,Tracing error: Please check correctness of provided example_input (but eval was correct)
 microsoft/layoutlmv3-base,layoutlmv3
 microsoft/markuplm-base,markuplm
 microsoft/resnet-50,resnet
 microsoft/speecht5_hifigan,hifigan,skip,Load problem
-microsoft/speecht5_tts,speecht5,skip,Load problem
+microsoft/speecht5_tts,speecht5,xfail,Tracing error: hangs with no error (probably because of an infinite while loop inside generate)
 microsoft/swinv2-tiny-patch4-window8-256,swinv2
 microsoft/table-transformer-detection,table-transformer
 microsoft/wavlm-large,wavlm,skip,Load problem
@@ -317,7 +317,7 @@ sahasrarjn/interbert,BERT,skip,Load problem
 saibo/genkalm-medium-gpt2,genkalm,skip,Load problem
 SajjadAyoubi/clip-fa-vision,clip_vision_model
 Salesforce/blip2-flan-t5-xl,blip-2,skip,Load problem
-Salesforce/blip-image-captioning-large,blip,skip,Load problem
+Salesforce/blip-image-captioning-large,blip
 Salesforce/instructblip-vicuna-7b,instructblip,skip,Load problem
 SamLowe/roberta-base-go_emotions,roberta
 sanchit-gandhi/enhanced_direct_s2st_en_to_es,speech-to-speech,skip,Load problem
diff --git a/tests/model_hub_tests/torch_tests/test_hf_transformers.py b/tests/model_hub_tests/torch_tests/test_hf_transformers.py
index 24d878408a5..3c735b90aad 100644
--- a/tests/model_hub_tests/torch_tests/test_hf_transformers.py
+++ b/tests/model_hub_tests/torch_tests/test_hf_transformers.py
@@ -154,6 +154,94 @@ class TestTransformersModel(TestTorchConvertModel):
 
             model = VIT_GPT2_Model(model)
             example = (encoded_input.pixel_values,)
+        elif 'idefics' in mi.tags:
+            from transformers import IdeficsForVisionText2Text, AutoProcessor
+            model = IdeficsForVisionText2Text.from_pretrained(name)
+            processor = AutoProcessor.from_pretrained(name)
+
+            prompts = [[
+                "User: What is in this image?",
+ "https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG", + "", + + "\nAssistant: This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground.", + + "\nUser:", + "https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052", + "And who is that?", + + "\nAssistant:", + ]] + + inputs = processor(prompts, add_end_of_utterance_token=False, return_tensors="pt") + exit_condition = processor.tokenizer("", add_special_tokens=False).input_ids + bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids + + example = dict(inputs) + example.update({ + 'eos_token_id': exit_condition, + 'bad_words_ids': bad_words_ids, + }) + + class Decorator(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + def forward(self, input_ids, attention_mask, pixel_values, image_attention_mask, eos_token_id, bad_words_ids): + return self.model.generate( + input_ids=input_ids, + attention_mask=attention_mask, + pixel_values=pixel_values, + image_attention_mask=image_attention_mask, + eos_token_id=eos_token_id, + bad_words_ids=bad_words_ids, + max_length=100 + ) + model = Decorator(model) + elif 'blip' in mi.tags and 'text2text-generation' in mi.tags: + from transformers import BlipProcessor, BlipForConditionalGeneration + + processor = BlipProcessor.from_pretrained(name) + model = BlipForConditionalGeneration.from_pretrained(name) + text = "a photography of" + inputs = processor(self.image, text, return_tensors="pt") + + class DecoratorForBlipForConditional(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, pixel_values, input_ids, attention_mask): + return self.model.generate(pixel_values, input_ids, attention_mask) + + model = DecoratorForBlipForConditional(model) + example = dict(inputs) + elif 'speecht5' in mi.tags: + from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan + from datasets import load_dataset + processor = SpeechT5Processor.from_pretrained(name) + model = SpeechT5ForTextToSpeech.from_pretrained(name) + + inputs = processor(text="Hello, my dog is cute.", return_tensors="pt") + # load xvector containing speaker's voice characteristics from a dataset + embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation") + speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0) + + example = {'input_ids': inputs["input_ids"], 'speaker_embeddings': speaker_embeddings} + class DecoratorModelForSeq2SeqLM(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + def forward(self, input_ids, speaker_embeddings): + return self.model.generate_speech(input_ids=input_ids, speaker_embeddings=speaker_embeddings) #, vocoder=vocoder) + model = DecoratorModelForSeq2SeqLM(model) + elif 'layoutlmv2' in mi.tags: + from transformers import LayoutLMv2Processor + processor = LayoutLMv2Processor.from_pretrained(name) + + question = "What's the content of this image?" + encoding = processor(self.image, question, max_length=512, truncation=True, return_tensors="pt") + example = dict(encoding) elif 'pix2struct' in mi.tags: from transformers import AutoProcessor, Pix2StructForConditionalGeneration model = Pix2StructForConditionalGeneration.from_pretrained(name, **model_kwargs)