We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Describe the bug I'm trying to optimize Whisper int8 for GPU, and the process is killed.
To Reproduce Optimize Whisper int8 medium on CUDA 11 on Ubuntu with 30 GB of RAM.
Expected behavior It should use less RAM. I have 30 GB — isn't that enough? Whisper tiny works.
Olive config
{ "input_model": { "type": "CompositeModel", "model_component_names": [ "encoder_decoder_init", "decoder" ], "model_components": [ { "type": "PyTorchModel", "model_path": "openai/whisper-medium", "model_script": "code/user_script.py", "script_dir": "code", "model_loader": "get_encoder_decoder_init", "io_config": "get_encdec_io_config", "dummy_inputs_func": "encoder_decoder_init_dummy_inputs" }, { "type": "PyTorchModel", "model_path": "openai/whisper-medium", "model_script": "code/user_script.py", "script_dir": "code", "model_loader": "get_decoder", "io_config": "get_dec_io_config", "dummy_inputs_func": "decoder_dummy_inputs" } ], "model_attributes": { "vocab_size": 51865, "num_mel_bins": 80, "d_model": 1024, "encoder_layers": 24, "encoder_attention_heads": 16, "decoder_layers": 24, "decoder_attention_heads": 16, "decoder_ffn_dim": 4096, "encoder_ffn_dim": 4096, "dropout": 0.0, "attention_dropout": 0.0, "activation_dropout": 0.0, "activation_function": "gelu", "init_std": 0.02, "encoder_layerdrop": 0.0, "decoder_layerdrop": 0.0, "use_cache": true, "num_hidden_layers": 24, "scale_embedding": false, "max_source_positions": 1500, "max_target_positions": 448, "classifier_proj_size": 256, "use_weighted_layer_sum": false, "apply_spec_augment": false, "mask_time_prob": 0.05, "mask_time_length": 10, "mask_time_min_masks": 2, "mask_feature_prob": 0.0, "mask_feature_length": 10, "mask_feature_min_masks": 0, "median_filter_width": 7, "return_dict": true, "output_hidden_states": false, "output_attentions": false, "torchscript": false, "torch_dtype": "float32", "use_bfloat16": false, "tf_legacy_loss": false, "pruned_heads": {}, "tie_word_embeddings": true, "chunk_size_feed_forward": 0, "is_encoder_decoder": true, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, "max_length": 448, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, 
"temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, "output_scores": false, "return_dict_in_generate": false, "forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "begin_suppress_tokens": [ 220, 50257 ], "architectures": [ "WhisperForConditionalGeneration" ], "finetuning_task": null, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "tokenizer_class": null, "prefix": null, "bos_token_id": 50257, "pad_token_id": 50257, "eos_token_id": 50257, "sep_token_id": null, "decoder_start_token_id": 50258, "task_specific_params": null, "problem_type": null, "_name_or_path": "openai/whisper-medium", "transformers_version": "4.42.4", "forced_decoder_ids": [ [ 1, 50259 ], [ 2, 50359 ], [ 3, 50363 ] ], "model_type": "whisper" } }, "systems": { "local_system": { "type": "LocalSystem", "accelerators": [ { "device": "gpu", "execution_providers": [ "CUDAExecutionProvider" ] } ] } }, "data_configs": [ { "name": "latency_data_config", "user_script": "code/user_script.py", "script_dir": "code", "load_dataset_config": { "type": "whisper_dataset", "data_dir": "data", "model_name": "openai/whisper-medium", "use_audio_decoder": true }, "dataloader_config": { "type": "no_auto_batch_dataloader" } } ], "evaluators": { "common_evaluator": { "metrics": [ { "name": "latency", "type": "latency", "sub_types": [ { "name": "avg", "priority": 1 } ], "data_config": "latency_data_config" } ] } }, "passes": { "conversion": { "type": "OnnxConversion", "target_opset": 17 }, "transformers_optimization": { "type": "OrtTransformersOptimization", "optimization_options": { "use_multi_head_attention": true }, "use_gpu": true }, "onnx_dynamic_quantization": { "type": "OnnxDynamicQuantization", "per_channel": false, 
"reduce_range": false, "op_types_to_quantize": [ "MatMul", "Gemm", "Gather" ], "MatMulConstBOnly": false }, "insert_beam_search": { "type": "InsertBeamSearch", "use_forced_decoder_ids": true, "use_logits_processor": false, "fp16": false }, "prepost": { "type": "AppendPrePostProcessingOps", "tool_command": "whisper", "tool_command_args": { "model_name": "openai/whisper-medium", "use_audio_decoder": true }, "target_opset": 17 } }, "log_severity_level": 0, "host": "local_system", "target": "local_system", "evaluator": "common_evaluator", "evaluate_input_model": false, "clean_cache": false, "cache_dir": "cache", "output_dir": "models", "output_name": "whisper_gpu_int8" }
Olive logs
[2024-08-10 19:48:37,393] [INFO] [run.py:138:run_engine] Running workflow default_workflow [2024-08-10 19:48:37,433] [INFO] [cache.py:51:__init__] Using cache directory: /root/home/Olive/examples/whisper/cache/default_workflow [2024-08-10 19:48:37,435] [INFO] [engine.py:1020:save_olive_config] Saved Olive config to /root/home/Olive/examples/whisper/cache/default_workflow/olive_config.json [2024-08-10 19:48:37,435] [DEBUG] [run.py:182:run_engine] Registering pass onnxconversion [2024-08-10 19:48:37,436] [DEBUG] [run.py:182:run_engine] Registering pass orttransformersoptimization [2024-08-10 19:48:37,436] [DEBUG] [run.py:182:run_engine] Registering pass onnxdynamicquantization [2024-08-10 19:48:37,437] [DEBUG] [run.py:182:run_engine] Registering pass insertbeamsearch [2024-08-10 19:48:37,438] [DEBUG] [run.py:182:run_engine] Registering pass appendprepostprocessingops [2024-08-10 19:48:37,440] [DEBUG] [accelerator_creator.py:130:_fill_accelerators] The accelerator device and execution providers are specified, skipping deduce. 
[2024-08-10 19:48:37,440] [DEBUG] [accelerator_creator.py:169:_check_execution_providers] Supported execution providers for device gpu: ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'] [2024-08-10 19:48:37,440] [DEBUG] [accelerator_creator.py:199:create_accelerators] Initial accelerators and execution providers: {'gpu': ['CUDAExecutionProvider']} [2024-08-10 19:48:37,440] [INFO] [accelerator_creator.py:224:create_accelerators] Running workflow on accelerator specs: gpu-cuda [2024-08-10 19:48:37,440] [DEBUG] [run.py:238:run_engine] Pass onnxconversion already registered [2024-08-10 19:48:37,440] [DEBUG] [run.py:238:run_engine] Pass orttransformersoptimization already registered [2024-08-10 19:48:37,440] [DEBUG] [run.py:238:run_engine] Pass onnxdynamicquantization already registered [2024-08-10 19:48:37,440] [DEBUG] [run.py:238:run_engine] Pass insertbeamsearch already registered [2024-08-10 19:48:37,440] [DEBUG] [run.py:238:run_engine] Pass appendprepostprocessingops already registered [2024-08-10 19:48:37,440] [DEBUG] [cache.py:304:set_cache_env] Set OLIVE_CACHE_DIR: /root/home/Olive/examples/whisper/cache/default_workflow [2024-08-10 19:48:37,457] [INFO] [engine.py:277:run] Running Olive on accelerator: gpu-cuda [2024-08-10 19:48:37,457] [INFO] [engine.py:1117:_create_system] Creating target system ... [2024-08-10 19:48:37,457] [DEBUG] [engine.py:1113:create_system] create native OliveSystem SystemType.Local [2024-08-10 19:48:37,457] [INFO] [engine.py:1120:_create_system] Target system created in 0.000150 seconds [2024-08-10 19:48:37,458] [INFO] [engine.py:1129:_create_system] Creating host system ... 
[2024-08-10 19:48:37,458] [DEBUG] [engine.py:1113:create_system] create native OliveSystem SystemType.Local [2024-08-10 19:48:37,458] [INFO] [engine.py:1132:_create_system] Host system created in 0.000098 seconds [2024-08-10 19:48:37,527] [DEBUG] [engine.py:717:_cache_model] Cached model 288c465c to /root/home/Olive/examples/whisper/cache/default_workflow/models/288c465c.json [2024-08-10 19:48:37,527] [DEBUG] [engine.py:352:run_accelerator] Running Olive in no-search mode ... [2024-08-10 19:48:37,527] [DEBUG] [engine.py:444:run_no_search] Running ['conversion', 'transformers_optimization', 'onnx_dynamic_quantization', 'insert_beam_search', 'prepost'] with no search ... [2024-08-10 19:48:37,527] [INFO] [engine.py:886:_run_pass] Running pass conversion:OnnxConversion [2024-08-10 19:48:37,528] [DEBUG] [engine.py:908:_run_pass] Loading model from cache ... [2024-08-10 19:48:37,528] [INFO] [engine.py:923:_run_pass] Loaded model from cache: 5_OnnxConversion-288c465c-5fa0d4af [2024-08-10 19:48:37,529] [INFO] [engine.py:886:_run_pass] Running pass transformers_optimization:OrtTransformersOptimization [2024-08-10 19:48:37,568] [DEBUG] [transformer_optimization.py:248:_run_for_config] model_type is set to bart from model attributes [2024-08-10 19:48:37,568] [DEBUG] [transformer_optimization.py:254:_run_for_config] num_heads is set to 16 from model attributes [2024-08-10 19:48:37,568] [DEBUG] [transformer_optimization.py:260:_run_for_config] hidden_size is set to 1024 from model attributes
Other information
Additional context It also happens with the non-GPU medium int8 model.
Do you have pre-built Whisper models? It's very hard to create them. Can you add tests for them to make sure they work on GPU and CPU? Thanks
The text was updated successfully, but these errors were encountered:
No branches or pull requests
Describe the bug
I'm trying to optimize Whisper int8 for GPU, and the process is killed.
To Reproduce
Optimize Whisper int8 medium on CUDA 11 on Ubuntu with 30 GB of RAM.
Expected behavior
It should use less RAM. I have 30 GB — isn't that enough? Whisper tiny works.
Olive config
Olive logs
Other information
Additional context
It also happens with the non-GPU medium int8 model.
Do you have pre-built Whisper models?
It's very hard to create them. Can you add tests for them to make sure they work on GPU and CPU?
Thanks
The text was updated successfully, but these errors were encountered: