We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Describe the bug I'm trying to optimize Whisper int8 for GPU, and the process is killed.
To Reproduce Optimize Whisper int8 medium on CUDA 11 on Ubuntu with 30 GB of RAM.
Expected behavior It should use less RAM. I have 30 GB — isn't that enough? Whisper tiny works.
Olive config
{ "input_model": { "type": "CompositeModel", "model_component_names": [ "encoder_decoder_init", "decoder" ], "model_components": [ { "type": "PyTorchModel", "model_path": "openai/whisper-medium", "model_script": "code/user_script.py", "script_dir": "code", "model_loader": "get_encoder_decoder_init", "io_config": "get_encdec_io_config", "dummy_inputs_func": "encoder_decoder_init_dummy_inputs" }, { "type": "PyTorchModel", "model_path": "openai/whisper-medium", "model_script": "code/user_script.py", "script_dir": "code", "model_loader": "get_decoder", "io_config": "get_dec_io_config", "dummy_inputs_func": "decoder_dummy_inputs" } ], "model_attributes": { "vocab_size": 51865, "num_mel_bins": 80, "d_model": 1024, "encoder_layers": 24, "encoder_attention_heads": 16, "decoder_layers": 24, "decoder_attention_heads": 16, "decoder_ffn_dim": 4096, "encoder_ffn_dim": 4096, "dropout": 0.0, "attention_dropout": 0.0, "activation_dropout": 0.0, "activation_function": "gelu", "init_std": 0.02, "encoder_layerdrop": 0.0, "decoder_layerdrop": 0.0, "use_cache": true, "num_hidden_layers": 24, "scale_embedding": false, "max_source_positions": 1500, "max_target_positions": 448, "classifier_proj_size": 256, "use_weighted_layer_sum": false, "apply_spec_augment": false, "mask_time_prob": 0.05, "mask_time_length": 10, "mask_time_min_masks": 2, "mask_feature_prob": 0.0, "mask_feature_length": 10, "mask_feature_min_masks": 0, "median_filter_width": 7, "return_dict": true, "output_hidden_states": false, "output_attentions": false, "torchscript": false, "torch_dtype": "float32", "use_bfloat16": false, "tf_legacy_loss": false, "pruned_heads": {}, "tie_word_embeddings": true, "chunk_size_feed_forward": 0, "is_encoder_decoder": true, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, "max_length": 448, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, 
"temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, "output_scores": false, "return_dict_in_generate": false, "forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "begin_suppress_tokens": [ 220, 50257 ], "architectures": [ "WhisperForConditionalGeneration" ], "finetuning_task": null, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "tokenizer_class": null, "prefix": null, "bos_token_id": 50257, "pad_token_id": 50257, "eos_token_id": 50257, "sep_token_id": null, "decoder_start_token_id": 50258, "task_specific_params": null, "problem_type": null, "_name_or_path": "openai/whisper-medium", "transformers_version": "4.42.4", "forced_decoder_ids": [ [ 1, 50259 ], [ 2, 50359 ], [ 3, 50363 ] ], "model_type": "whisper" } }, "systems": { "local_system": { "type": "LocalSystem", "accelerators": [ { "device": "gpu", "execution_providers": [ "CUDAExecutionProvider" ] } ] } }, "data_configs": [ { "name": "latency_data_config", "user_script": "code/user_script.py", "script_dir": "code", "load_dataset_config": { "type": "whisper_dataset", "data_dir": "data", "model_name": "openai/whisper-medium", "use_audio_decoder": true }, "dataloader_config": { "type": "no_auto_batch_dataloader" } } ], "evaluators": { "common_evaluator": { "metrics": [ { "name": "latency", "type": "latency", "sub_types": [ { "name": "avg", "priority": 1 } ], "data_config": "latency_data_config" } ] } }, "passes": { "conversion": { "type": "OnnxConversion", "target_opset": 17 }, "transformers_optimization": { "type": "OrtTransformersOptimization", "optimization_options": { "use_multi_head_attention": true }, "use_gpu": true }, "onnx_dynamic_quantization": { "type": "OnnxDynamicQuantization", "per_channel": false, 
"reduce_range": false, "op_types_to_quantize": [ "MatMul", "Gemm", "Gather" ], "MatMulConstBOnly": false }, "insert_beam_search": { "type": "InsertBeamSearch", "use_forced_decoder_ids": true, "use_logits_processor": false, "fp16": false }, "prepost": { "type": "AppendPrePostProcessingOps", "tool_command": "whisper", "tool_command_args": { "model_name": "openai/whisper-medium", "use_audio_decoder": true }, "target_opset": 17 } }, "log_severity_level": 0, "host": "local_system", "target": "local_system", "evaluator": "common_evaluator", "evaluate_input_model": false, "clean_cache": false, "cache_dir": "cache", "output_dir": "models", "output_name": "whisper_gpu_int8" }
Olive logs
[2024-08-10 19:48:37,393] [INFO] [run.py:138:run_engine] Running workflow default_workflow [2024-08-10 19:48:37,433] [INFO] [cache.py:51:__init__] Using cache directory: /root/home/Olive/examples/whisper/cache/default_workflow [2024-08-10 19:48:37,435] [INFO] [engine.py:1020:save_olive_config] Saved Olive config to /root/home/Olive/examples/whisper/cache/default_workflow/olive_config.json [2024-08-10 19:48:37,435] [DEBUG] [run.py:182:run_engine] Registering pass onnxconversion [2024-08-10 19:48:37,436] [DEBUG] [run.py:182:run_engine] Registering pass orttransformersoptimization [2024-08-10 19:48:37,436] [DEBUG] [run.py:182:run_engine] Registering pass onnxdynamicquantization [2024-08-10 19:48:37,437] [DEBUG] [run.py:182:run_engine] Registering pass insertbeamsearch [2024-08-10 19:48:37,438] [DEBUG] [run.py:182:run_engine] Registering pass appendprepostprocessingops [2024-08-10 19:48:37,440] [DEBUG] [accelerator_creator.py:130:_fill_accelerators] The accelerator device and execution providers are specified, skipping deduce. 
[2024-08-10 19:48:37,440] [DEBUG] [accelerator_creator.py:169:_check_execution_providers] Supported execution providers for device gpu: ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'] [2024-08-10 19:48:37,440] [DEBUG] [accelerator_creator.py:199:create_accelerators] Initial accelerators and execution providers: {'gpu': ['CUDAExecutionProvider']} [2024-08-10 19:48:37,440] [INFO] [accelerator_creator.py:224:create_accelerators] Running workflow on accelerator specs: gpu-cuda [2024-08-10 19:48:37,440] [DEBUG] [run.py:238:run_engine] Pass onnxconversion already registered [2024-08-10 19:48:37,440] [DEBUG] [run.py:238:run_engine] Pass orttransformersoptimization already registered [2024-08-10 19:48:37,440] [DEBUG] [run.py:238:run_engine] Pass onnxdynamicquantization already registered [2024-08-10 19:48:37,440] [DEBUG] [run.py:238:run_engine] Pass insertbeamsearch already registered [2024-08-10 19:48:37,440] [DEBUG] [run.py:238:run_engine] Pass appendprepostprocessingops already registered [2024-08-10 19:48:37,440] [DEBUG] [cache.py:304:set_cache_env] Set OLIVE_CACHE_DIR: /root/home/Olive/examples/whisper/cache/default_workflow [2024-08-10 19:48:37,457] [INFO] [engine.py:277:run] Running Olive on accelerator: gpu-cuda [2024-08-10 19:48:37,457] [INFO] [engine.py:1117:_create_system] Creating target system ... [2024-08-10 19:48:37,457] [DEBUG] [engine.py:1113:create_system] create native OliveSystem SystemType.Local [2024-08-10 19:48:37,457] [INFO] [engine.py:1120:_create_system] Target system created in 0.000150 seconds [2024-08-10 19:48:37,458] [INFO] [engine.py:1129:_create_system] Creating host system ... 
[2024-08-10 19:48:37,458] [DEBUG] [engine.py:1113:create_system] create native OliveSystem SystemType.Local [2024-08-10 19:48:37,458] [INFO] [engine.py:1132:_create_system] Host system created in 0.000098 seconds [2024-08-10 19:48:37,527] [DEBUG] [engine.py:717:_cache_model] Cached model 288c465c to /root/home/Olive/examples/whisper/cache/default_workflow/models/288c465c.json [2024-08-10 19:48:37,527] [DEBUG] [engine.py:352:run_accelerator] Running Olive in no-search mode ... [2024-08-10 19:48:37,527] [DEBUG] [engine.py:444:run_no_search] Running ['conversion', 'transformers_optimization', 'onnx_dynamic_quantization', 'insert_beam_search', 'prepost'] with no search ... [2024-08-10 19:48:37,527] [INFO] [engine.py:886:_run_pass] Running pass conversion:OnnxConversion [2024-08-10 19:48:37,528] [DEBUG] [engine.py:908:_run_pass] Loading model from cache ... [2024-08-10 19:48:37,528] [INFO] [engine.py:923:_run_pass] Loaded model from cache: 5_OnnxConversion-288c465c-5fa0d4af [2024-08-10 19:48:37,529] [INFO] [engine.py:886:_run_pass] Running pass transformers_optimization:OrtTransformersOptimization [2024-08-10 19:48:37,568] [DEBUG] [transformer_optimization.py:248:_run_for_config] model_type is set to bart from model attributes [2024-08-10 19:48:37,568] [DEBUG] [transformer_optimization.py:254:_run_for_config] num_heads is set to 16 from model attributes [2024-08-10 19:48:37,568] [DEBUG] [transformer_optimization.py:260:_run_for_config] hidden_size is set to 1024 from model attributes
Other information
Additional context It also happens with the non-GPU medium int8 model.
Do you have pre-built Whisper models? It's very hard to create them. Can you add tests for them to make sure they work on GPU and CPU? Thanks
The text was updated successfully, but these errors were encountered:
No branches or pull requests
Describe the bug
I'm trying to optimize Whisper int8 for GPU, and the process is killed.
To Reproduce
Optimize Whisper int8 medium on CUDA 11 on Ubuntu with 30 GB of RAM.
Expected behavior
It should use less RAM. I have 30 GB — isn't that enough? Whisper tiny works.
Olive config
Olive logs
Other information
Additional context
It also happens with the non-GPU medium int8 model.
Do you have pre-built Whisper models?
It's very hard to create them. Can you add tests for them to make sure they work on GPU and CPU?
Thanks
The text was updated successfully, but these errors were encountered: