From d5d9fd11b38481927171c6ee3e7d9d6b5b713d79 Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Thu, 30 Nov 2023 15:37:41 +0100
Subject: [PATCH] [tests] resolve skipped HF tests: 1st batch (#21305)

* initial

* some corrections for the first batch

* corrected classes for generate, corrected xfail

* leave only models with correct example input

* remove an obsolete line

* Update tests/model_hub_tests/torch_tests/test_hf_transformers.py

Co-authored-by: Maxim Vafin

* update musicgen

* cleanup test_hf_transformers.py

* typo fix

* Update tests/model_hub_tests/torch_tests/test_hf_transformers.py

* Update tests/model_hub_tests/torch_tests/test_hf_transformers.py

* move to up: corrected xfail

* revert back accidentally deleted elif

* Update tests/model_hub_tests/torch_tests/test_hf_transformers.py

* Update tests/model_hub_tests/torch_tests/test_hf_transformers.py

---------

Co-authored-by: Maxim Vafin
---
 .../torch_tests/hf_transformers_models  |  6 +--
 .../torch_tests/test_hf_transformers.py | 41 +++++++++++++++++++
 2 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/tests/model_hub_tests/torch_tests/hf_transformers_models b/tests/model_hub_tests/torch_tests/hf_transformers_models
index b63e9eae171..cf699be6289 100644
--- a/tests/model_hub_tests/torch_tests/hf_transformers_models
+++ b/tests/model_hub_tests/torch_tests/hf_transformers_models
@@ -74,8 +74,8 @@ facebook/flava-image-codebook,flava_image_codebook,skip,Load problem
 facebook/m2m100_418M,m2m_100
 facebook/mask2former-swin-base-coco-panoptic,mask2former
 facebook/maskformer-swin-base-coco,maskformer
-facebook/mms-tts-eng,vits,skip,Load problem
-facebook/musicgen-small,musicgen,skip,Load problem
+facebook/mms-tts-eng,vits,xfail,Accuracy failed: results cannot be broadcasted
+facebook/musicgen-small,musicgen
 facebook/opt-125m,opt
 facebook/rag-token-nq,rag,skip,Load problem
 facebook/sam-vit-large,sam,xfail,No node with name original_sizes
@@ -104,7 +104,7 @@ google/mobilebert-uncased,mobilebert
 google/mobilenet_v1_0.75_192,mobilenet_v1
 google/mt5-base,mt5
 google/owlvit-base-patch32,owlvit
-google/pix2struct-docvqa-base,pix2struct,skip,Load problem
+google/pix2struct-docvqa-base,pix2struct
 google/realm-orqa-nq-openqa,realm,skip,Load problem
 google/reformer-crime-and-punishment,reformer,xfail,Tracing problem
 google/tapas-large-finetuned-wtq,tapas
diff --git a/tests/model_hub_tests/torch_tests/test_hf_transformers.py b/tests/model_hub_tests/torch_tests/test_hf_transformers.py
index 6ad74a8ffb5..e13fa5a6b9c 100644
--- a/tests/model_hub_tests/torch_tests/test_hf_transformers.py
+++ b/tests/model_hub_tests/torch_tests/test_hf_transformers.py
@@ -88,6 +88,26 @@ class TestTransformersModel(TestTorchConvertModel):
 
             model = VIT_GPT2_Model(model)
             example = (encoded_input.pixel_values,)
+        elif 'pix2struct' in mi.tags:
+            from transformers import AutoProcessor, Pix2StructForConditionalGeneration
+            model = Pix2StructForConditionalGeneration.from_pretrained(name)
+            processor = AutoProcessor.from_pretrained(name)
+
+            import requests
+            from PIL import Image
+            image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
+            image = Image.open(requests.get(image_url, stream=True).raw)
+            question = "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"
+            inputs = processor(images=image, text=question, return_tensors="pt")
+            example = dict(inputs)
+
+            class DecoratorModelForSeq2SeqLM(torch.nn.Module):
+                def __init__(self, model):
+                    super().__init__()
+                    self.model = model
+                def forward(self, flattened_patches, attention_mask):
+                    return self.model.generate(flattened_patches=flattened_patches, attention_mask=attention_mask)
+            model = DecoratorModelForSeq2SeqLM(model)
         elif "mms-lid" in name:
             # mms-lid model config does not have auto_model attribute, only direct loading available
             from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor
@@ -149,6 +169,27 @@ class TestTransformersModel(TestTorchConvertModel):
                 0, 255, [16, 3, 224, 224]).to(torch.float32))
             inputs = processor(video, return_tensors="pt")
             example = dict(inputs)
+        elif 'text-to-speech' in mi.tags:
+            from transformers import AutoTokenizer
+            tokenizer = AutoTokenizer.from_pretrained(name)
+            text = "some example text in the English language"
+            inputs = tokenizer(text, return_tensors="pt")
+            example = dict(inputs)
+        elif 'musicgen' in mi.tags:
+            from transformers import AutoProcessor, AutoModelForTextToWaveform
+            processor = AutoProcessor.from_pretrained(name)
+            model = AutoModelForTextToWaveform.from_pretrained(name, torchscript=True)
+
+            inputs = processor(
+                text=["80s pop track with bassy drums and synth"],
+                padding=True,
+                return_tensors="pt",
+            )
+            example = dict(inputs)
+            # works for facebook/musicgen-small
+            pad_token_id = model.generation_config.pad_token_id
+            example["decoder_input_ids"] = torch.ones(
+                (inputs.input_ids.shape[0] * model.decoder.num_codebooks, 1), dtype=torch.long) * pad_token_id
         else:
             try:
                 if auto_model == "AutoModelForCausalLM":
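
Note on the pix2struct hunk: `generate()` is not a traceable `forward()`, so the patch hides it behind a thin `nn.Module`. A minimal standalone sketch of that pattern, assuming the same two-tensor input signature as in the diff (the class name `GenerateWrapper` is illustrative, not from the patch):

import torch

class GenerateWrapper(torch.nn.Module):
    """Exposes model.generate() through forward() so tracing-based
    converters see an ordinary module with tensor inputs and outputs."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, flattened_patches, attention_mask):
        # generate() runs the full autoregressive decoding loop internally
        return self.model.generate(flattened_patches=flattened_patches,
                                   attention_mask=attention_mask)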
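
Note on the musicgen hunk: the decoder consumes one token stream per audio codebook, so it is primed with one pad token per (batch, codebook) pair. A small sketch of the shape arithmetic; the concrete values (4 codebooks, pad id 2048) are assumptions for facebook/musicgen-small and are normally read from model.decoder.num_codebooks and model.generation_config.pad_token_id as the patch does:

import torch

batch_size = 1       # inputs.input_ids.shape[0] for a single text prompt
num_codebooks = 4    # assumed: model.decoder.num_codebooks for musicgen-small
pad_token_id = 2048  # assumed: model.generation_config.pad_token_id

# One column of pad tokens, one row per (batch, codebook) stream.
decoder_input_ids = torch.ones(
    (batch_size * num_codebooks, 1), dtype=torch.long) * pad_token_id
print(decoder_input_ids.shape)  # torch.Size([4, 1])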