#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0

##################### run_gpt2_from_huggingface.py #############################
#
# Copyright 2019-2024 The IBM Research Authors.
#
################################################################################
#
# This script runs GPT2 from HuggingFace with onnx-mlir.
# Command: ONNX_MLIR_HOME=/workdir/onnx-mlir/build/Debug python run_gpt2_from_huggingface.py 2>&1 | tee log.txt
#
# When this script is run for the first time, it downloads the onnx models from
# HuggingFace and compiles them. The onnx models and compiled models are
# cached in the current folder (by default).
#
# Change compile_flags if targeting a different machine.
################################################################################
| 19 | + |
| 20 | +import os |
| 21 | +import sys |
| 22 | +import time |
| 23 | +import json |
| 24 | +import requests as req |
| 25 | +from urllib.request import urlretrieve |
| 26 | + |
| 27 | +import numpy as np |
| 28 | +from transformers import AutoTokenizer |
| 29 | + |
| 30 | +# Include runtime directory in python paths, so PyRuntime can be imported. |
| 31 | +RUNTIME_DIR = os.path.join(os.environ["ONNX_MLIR_HOME"], "lib") |
| 32 | +sys.path.append(RUNTIME_DIR) |
| 33 | +try: |
| 34 | + from PyCompileAndRuntime import OMCompileExecutionSession |
| 35 | +except ImportError: |
| 36 | + raise ImportError( |
| 37 | + "Looks like you did not build the PyRuntime target, build it by running `make PyRuntime`." |
| 38 | + "You may need to set ONNX_MLIR_HOME to `onnx-mlir/build/Debug` since `make PyRuntime` outputs to `build/Debug` by default" |
| 39 | + ) |
| 40 | + |
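# Note: PyCompileAndRuntime compiles and runs in one step. If the models have
# already been compiled into shared libraries (e.g. with the `onnx-mlir` driver),
# the plain runtime session could be used instead. A minimal sketch, assuming a
# precompiled `decoder_model.so` exists next to this script (the class name may
# differ across onnx-mlir versions):
#
#   from PyRuntime import OMExecutionSession
#   precompiled_sess = OMExecutionSession("./decoder_model.so")
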
# Information to download onnx models from HuggingFace.
model_name_or_path = "gpt2"  # can be gpt2, gpt2-medium, gpt2-large
decoder_model_name = "decoder_model.onnx"
decoder_with_past_model_name = "decoder_with_past_model.onnx"
config_json_name = "config.json"
decoder_url = f"https://huggingface.co/openai-community/{model_name_or_path}/resolve/main/onnx/{decoder_model_name}"
decoder_with_past_url = f"https://huggingface.co/openai-community/{model_name_or_path}/resolve/main/onnx/{decoder_with_past_model_name}"
config_json_url = f"https://huggingface.co/openai-community/{model_name_or_path}/resolve/main/onnx/{config_json_name}"

# Local directories for caching the model.
cache_dir = "./"
decoder_model_path = f"{cache_dir}/{decoder_model_name}"
decoder_with_past_model_path = f"{cache_dir}/{decoder_with_past_model_name}"
config_json_path = f"{cache_dir}/{config_json_name}"

# Download the models to a local dir if they are not already cached.
if not os.path.exists(decoder_model_path):
    print(f"Downloading {decoder_url}")
    urlretrieve(decoder_url, decoder_model_path)
    print("Done")
# Large models keep their weights in a separate external data file; fetch it
# only if the server provides one and it is not cached yet.
if (
    not os.path.exists(decoder_model_path + "_data")
    and req.head(f"{decoder_url}_data", allow_redirects=True).status_code == 200
):
    print(f"Downloading {decoder_url}_data")
    urlretrieve(decoder_url + "_data", decoder_model_path + "_data")
    print("Done")
if not os.path.exists(decoder_with_past_model_path):
    print(f"Downloading {decoder_with_past_url}")
    urlretrieve(decoder_with_past_url, decoder_with_past_model_path)
    print("Done")
if (
    not os.path.exists(decoder_with_past_model_path + "_data")
    and req.head(f"{decoder_with_past_url}_data", allow_redirects=True).status_code
    == 200
):
    print(f"Downloading {decoder_with_past_url}_data")
    urlretrieve(decoder_with_past_url + "_data", decoder_with_past_model_path + "_data")
    print("Done")
if not os.path.exists(config_json_path):
    print(f"Downloading the config json file {config_json_url}")
    urlretrieve(config_json_url, config_json_path)
    print("Done")

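# Alternatively, the huggingface_hub client could fetch the same files and
# handle caching and external data itself. A hedged sketch (requires the
# `huggingface_hub` package; note it keeps the repo's `onnx/` subfolder, so the
# cache paths above would need to point there):
#
#   from huggingface_hub import hf_hub_download
#   hf_hub_download(
#       repo_id=f"openai-community/{model_name_or_path}",
#       filename=f"onnx/{decoder_model_name}",
#       local_dir=cache_dir,
#   )
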
with open(config_json_path) as f:
    cfg = json.load(f)
    print("Model configuration: {}\n".format(cfg))
    num_attention_heads = cfg["n_head"]
    hidden_size = cfg["n_embd"]
    num_layers = cfg["n_layer"]
    eos_token_id = cfg["eos_token_id"]

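# A hedged note on the values above, assuming the standard HuggingFace GPT2
# ONNX export: every decoder step exchanges one key and one value tensor per
# layer, each of shape [batch_size, n_head, sequence_length, head_dim].
# Not used below; computed only to make the cache geometry explicit.
head_dim = hidden_size // num_attention_heads  # 768 // 12 = 64 for "gpt2"
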
# Create a CompileExecutionSession to compile and run each model.
compile_flags = "-O3 -v --onnx-op-stats TXT"
# compile_flags = "-O3 -mcpu=z16 -maccel=NNPA -v --onnx-op-stats TXT"
decoder_sess = OMCompileExecutionSession(
    decoder_model_path, compile_flags + " -tag=decoder", reuse_compiled_model=1
)
decoder_with_past_sess = OMCompileExecutionSession(
    decoder_with_past_model_path,
    compile_flags + " -tag=decoder_with_past",
    reuse_compiled_model=1,
)
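
# The generation loop below assumes the I/O layout of the HuggingFace GPT2
# ONNX export:
#   decoder_model.onnx:           inputs  = [input_ids, attention_mask]
#                                 outputs = [logits, present key/value tensors...]
#   decoder_with_past_model.onnx: inputs  = [input_ids, past key/value tensors...,
#                                            attention_mask]
#                                 outputs = [logits, present key/value tensors...]
# If a different export is used, adjust how `inputs` is assembled in the loop.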

# Setup a tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)

# Tokenize the input text.
prompt_text = "Which is the highest mountain in Japan?"
pt_inputs = tokenizer(prompt_text, return_tensors="pt")
output_length = 13
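
# A hedged aside: return_tensors="pt" produces PyTorch tensors, so the loop
# below calls .numpy() on them. The tokenizer also accepts return_tensors="np",
# which yields NumPy int64 arrays directly, e.g.:
#
#   np_inputs = tokenizer(prompt_text, return_tensors="np")
#   # np_inputs["input_ids"] and np_inputs["attention_mask"] could then be
#   # passed to the sessions without conversion.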

# Generate tokens.
ts = []
num_runs = 3
for r in range(num_runs):
    output_ids = []
    kv_cache = None

    t = 0
    attention_mask = pt_inputs["attention_mask"].numpy()
    inputs = [pt_inputs["input_ids"].numpy(), attention_mask]
    for step in range(output_length):
        t0 = time.time()
        # The first step runs the decoder without past key/values; subsequent
        # steps reuse the cached key/values and only feed the new token.
        if kv_cache is None:
            outputs = decoder_sess.run(inputs)
        else:
            outputs = decoder_with_past_sess.run(inputs)
        t_elap = time.time() - t0
        t += t_elap
        # Greedy approach is used here: pick the most likely token from the
        # logits of the last position.
        logits = outputs[0][:, -1, :]
        next_id = np.argmax(logits, axis=1, keepdims=True)
        kv_cache = outputs[1:]
        # Only for batch size = 1: extend the attention mask by one position
        # for the newly generated token.
        attention_mask = np.append(
            attention_mask, np.array([[1]], dtype=np.int64), axis=1
        )
        inputs = [next_id] + kv_cache + [attention_mask]
        output_ids += [next_id[0][0]]

    ts += [t]
    if r == num_runs - 1:
        # Expected answer: "The highest mountain in Japan is the Mt. Fuji."
        print("Prompt: {}".format(prompt_text))
        print("Generated words: {}\n".format(tokenizer.decode(output_ids).strip()))

print("times", ts)
t = np.min(ts)
print("t_elap: %.2f seconds" % (t))
print(
    "Latency: {} msec/token, throughput: {} tokens/sec".format(
        t / output_length * 1000, output_length / t
    )
)