隆重推出 Google AI Edge Portal：大規模進行 Edge AI 基準測試。歡迎申請非公開預先發布版的存取權。

實作自訂委派

LiteRT「委派」(Delegate) 可讓您在其他執行程式 (executor) 上執行模型 (部分或整個)。這個機制可運用各種裝置端加速器進行推論，例如 GPU 或 Edge TPU (Tensor Processing Unit)。這讓開發人員能以靈活、且與預設 TFLite 執行路徑分離的方式加快推論速度。

下方圖表概述委派代表，詳情請參閱以下各節。

TFLite 委派

何時該建立自訂委派代表？

LiteRT 有各種適用於目標加速器的委派項目，例如 GPU、DSP 和 EdgeTPU

下列情況適合自行建立委派代表：

您想要整合現有委派皆不支援的新機器學習推論引擎。
您有自訂的硬體加速器，可在已知情境中改善執行階段效能。
您正在開發可加快特定模型速度的 CPU 最佳化項目 (例如運算子融合)。

委派代表的運作方式為何？

請考慮如下所示的簡單模型圖，以及一個名為「MyDelegate」的委派，它針對 Conv2D 和 Mean (平均值) 作業提供了更快的實作。

原始圖表

套用這個「MyDelegate」後，原始 LiteRT 圖形會更新為如下所示：

顯示委派代表的圖表

上方圖表是 LiteRT 依照下列兩項規則分割原始圖形後所得到的結果：

委派可處理的特定作業會放入同一個分區，同時仍滿足原始運算工作流程中各作業之間的依附關係。
每個待委派分區的輸入和輸出節點，都只會是不由該委派處理的節點。

每個由委派處理的分區，都會在原始圖形中由一個委派節點 (也可稱為委派核心) 取代，該節點會在被呼叫執行時評估該分區。

視模型而定，最終圖形可能包含一或多個節點；若有多個節點，表示委派不支援某些作業。一般來說，您不會希望委派處理多個分區，因為每次從委派切換回主要圖形時，將委派子圖形的結果傳遞至主要圖形都會因記憶體複製 (例如從 GPU 到 CPU) 而產生額外負擔。這類負擔可能會抵銷效能提升，在記憶體複製量很大時尤其明顯。

導入您自己的自訂委派

我們建議使用 SimpleDelegate API 新增委派。

如要建立新的委派代表，您必須實作 2 個介面，並提供介面方法的實作。

1 - `SimpleDelegateInterface`

這個類別代表委派的功能 (即支援哪些作業)，同時也是用於建立核心的工廠類別，該核心會封裝委派圖形。詳情請參閱 C++ 標頭檔案中定義的介面。程式碼中的註解會詳細說明各個 API。

2 - `SimpleDelegateKernelInterface`

這個類別會封裝用於初始化、準備及執行委派分區的邏輯。

它具備：(請參閱定義)

Init(...)：系統會呼叫一次，執行任何一次性初始化作業。
Prepare(...)：針對這個節點的每個不同執行個體呼叫；如果您有多個委派分區，就會發生這種情況。由於每次調整張量大小時都會呼叫這個方法，通常建議在此進行記憶體配置。
Invoke(...)：系統會呼叫此方法進行推論 (在下方範例程式碼中，對應的覆寫方法名稱為 `Eval(...)`)。

範例

在這個範例中，您將建立非常簡單的委派，只支援 2 種運算類型 (ADD) 和 (SUB)，且僅支援 float32 張量。

// MyDelegate is this example's SimpleDelegateInterface implementation.
// It describes what the delegate can handle: ADD and SUB nodes whose input
// tensors are all float32.
class MyDelegate : public SimpleDelegateInterface {
 public:
  // Returns true only when the node is an ADD or SUB op and every one of its
  // input tensors has type kTfLiteFloat32.
  bool IsNodeSupportedByDelegate(const TfLiteRegistration* registration,
                                 const TfLiteNode* node,
                                 TfLiteContext* context) const override {
    const auto op_code = registration->builtin_code;
    const bool is_add_or_sub =
        op_code == kTfLiteBuiltinAdd || op_code == kTfLiteBuiltinSub;
    if (!is_add_or_sub) {
      return false;
    }
    // Reject the node as soon as one non-float32 input is found.
    const auto* input_ids = node->inputs;
    for (int idx = 0; idx < input_ids->size; ++idx) {
      const auto& input = context->tensors[input_ids->data[idx]];
      if (input.type != kTfLiteFloat32) {
        return false;
      }
    }
    return true;
  }

  // This delegate needs no delegate-wide setup, so initialization always
  // reports success.
  TfLiteStatus Initialize(TfLiteContext* context) override {
    return kTfLiteOk;
  }

  // Returns the delegate's name; the string has static storage duration.
  const char* Name() const override {
    static constexpr char kDelegateName[] = "MyDelegate";
    return kDelegateName;
  }

  // Creates a fresh kernel object (MyDelegateKernel) and hands ownership to
  // the caller.
  std::unique_ptr<SimpleDelegateKernelInterface> CreateDelegateKernelInterface()
      override {
    return std::make_unique<MyDelegateKernel>();
  }
};

接著，繼承 SimpleDelegateKernelInterface 來建立您自己的委派核心：

// MyDelegateKernel evaluates one delegated partition.
// Init() records, for every node in the partition, the tensor indices of its
// operands and its builtin op code; Eval() then replays those nodes in order.
// The implementation is intentionally simple because it is for demonstration.
class MyDelegateKernel : public SimpleDelegateKernelInterface {
 public:
  // Caches the input/output tensor indices and the builtin code of every node
  // listed in `params->nodes_to_replace`.
  // Returns kTfLiteOk on success. Returns early with an error status when a
  // node cannot be looked up or does not have exactly the two inputs and one
  // output that this kernel reads.
  TfLiteStatus Init(TfLiteContext* context,
                    const TfLiteDelegateParams* params) override {
    const int num_nodes = params->nodes_to_replace->size;
    inputs_.resize(num_nodes);
    outputs_.resize(num_nodes);
    builtin_code_.resize(num_nodes);
    for (int i = 0; i < num_nodes; ++i) {
      const int node_index = params->nodes_to_replace->data[i];
      // Get this node information.
      TfLiteNode* delegated_node = nullptr;
      TfLiteRegistration* delegated_node_registration = nullptr;
      TF_LITE_ENSURE_EQ(
          context,
          context->GetNodeAndRegistration(context, node_index, &delegated_node,
                                          &delegated_node_registration),
          kTfLiteOk);
      // Guard the fixed-position reads below: data[0]/data[1] of the inputs
      // and data[0] of the outputs must exist.
      TF_LITE_ENSURE_EQ(context, delegated_node->inputs->size, 2);
      TF_LITE_ENSURE_EQ(context, delegated_node->outputs->size, 1);
      // Assign (rather than append) so the cached lists always hold exactly
      // this node's operands, even if they previously held entries.
      inputs_[i] = {delegated_node->inputs->data[0],
                    delegated_node->inputs->data[1]};
      outputs_[i] = {delegated_node->outputs->data[0]};
      builtin_code_[i] = delegated_node_registration->builtin_code;
    }
    return kTfLiteOk;
  }

  // Nothing needs to be prepared for this kernel; always succeeds.
  TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) override {
    return kTfLiteOk;
  }

  // Evaluates the delegated graph by looping over all delegated nodes.
  // Every node is either an ADD or a SUB operation. The number of nodes equals
  // inputs_.size(); inputs_[i] lists the input tensor indices of node i and
  // outputs_[i] lists its output tensor indices.
  // Returns kTfLiteOk, or returns early with an error status as soon as one
  // node fails to compute.
  TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) override {
    for (size_t i = 0; i < inputs_.size(); ++i) {
      // Add/Sub operation accepts 2 inputs and produces 1 output.
      auto& input_tensor_1 = context->tensors[inputs_[i][0]];
      auto& input_tensor_2 = context->tensors[inputs_[i][1]];
      auto& output_tensor = context->tensors[outputs_[i][0]];
      TF_LITE_ENSURE_EQ(
          context,
          ComputeResult(context, builtin_code_[i], &input_tensor_1,
                        &input_tensor_2, &output_tensor),
          kTfLiteOk);
    }
    return kTfLiteOk;
  }

 private:
  // Computes the element-wise sum (when `builtin_code` is kTfLiteBuiltinAdd)
  // or difference (for any other code) of 'input_tensor_1' and
  // 'input_tensor_2' and stores the result in 'output_tensor'.
  // Returns kTfLiteDelegateError when the three tensors do not all have the
  // same number of elements, kTfLiteOk otherwise.
  TfLiteStatus ComputeResult(TfLiteContext* context, int builtin_code,
                             const TfLiteTensor* input_tensor_1,
                             const TfLiteTensor* input_tensor_2,
                             TfLiteTensor* output_tensor) {
    // The element count is loop-invariant, so compute it once.
    auto num_elements = NumElements(input_tensor_1);
    if (num_elements != NumElements(input_tensor_2) ||
        num_elements != NumElements(output_tensor)) {
      return kTfLiteDelegateError;
    }
    // This code assumes no activation, and no broadcasting needed (both inputs
    // have the same size).
    auto* input_1 = GetTensorData<float>(input_tensor_1);
    auto* input_2 = GetTensorData<float>(input_tensor_2);
    auto* output = GetTensorData<float>(output_tensor);
    const bool is_add = (builtin_code == kTfLiteBuiltinAdd);
    for (decltype(num_elements) i = 0; i < num_elements; ++i) {
      output[i] = is_add ? input_1[i] + input_2[i] : input_1[i] - input_2[i];
    }
    return kTfLiteOk;
  }

  // Holds the indices of the input/output tensors.
  // inputs_[i] is list of all input tensors to node at index 'i'.
  // outputs_[i] is list of all output tensors to node at index 'i'.
  std::vector<std::vector<int>> inputs_, outputs_;
  // Holds the builtin code of the ops.
  // builtin_code_[i] is the type of node at index 'i'
  std::vector<int> builtin_code_;
};

對新委派代表進行基準測試及評估

TFLite 有一套工具可讓您快速測試 TFLite 模型。

模型基準工具：這項工具會採用 TFLite 模型、隨機產生輸入內容，然後依指定次數重複執行模型，並在結束時輸出延遲時間的匯總統計資料。
推論差異工具：這項工具會針對特定模型產生隨機高斯資料，並將資料傳入兩個不同的 TFLite 解譯器：一個執行單一執行緒 CPU 核心，另一個則使用使用者定義的規格。工具會逐一元素測量兩個解譯器輸出張量之間的絕對差異，也有助於對準確性問題進行偵錯。
此外還有針對特定工作的評估工具，適用於圖片分類和物件偵測。這些工具可以在這裡找到。

此外，TFLite 還有大量的核心和運算子單元測試，可重複用於測試新的委派，並確保一般 TFLite 執行路徑未遭破壞。

如要將 TFLite 測試和工具重複用於新的委派，您可以使用下列任一選項：

使用委派註冊商 (delegate registrar) 機制
使用外部委派 (external delegate) 機制

選擇最佳方法

這兩種方法都需要進行幾項變更，詳情如下。不過，第一種方法會以靜態方式連結委派，且必須重新建構測試、基準測試和評估工具。相對地，第二種方法會將委派建構為共用程式庫，並要求您從共用程式庫公開建立/刪除方法。

因此，外部委派機制可搭配 TFLite 預先建構的 LiteRT 工具二進位檔使用。但這種做法較不明確，在自動化整合測試中設定起來也可能較為複雜。如要更清楚明確，請使用委派註冊商方法。

選項 1：善用委派註冊商

委派註冊商會保存一份委派供應商清單，每個供應商都能根據指令列旗標輕鬆建立 TFLite 委派，因此很適合用於工具。如要將新的委派插入上述所有 LiteRT 工具，請先建立新的委派供應商，然後只需對 BUILD 規則稍做修改即可。完整的整合範例如下所示 (程式碼請參閱這裡)。

假設您已有實作 SimpleDelegate API 的委派，以及用於建立/刪除這個「虛擬」(dummy) 委派的 extern "C" API，如下所示：

// Returns the default options for DummyDelegate.
DummyDelegateOptions TfLiteDummyDelegateOptionsDefault();

// Creates a new delegate instance. The instance must be destroyed with
// `TfLiteDummyDelegateDelete` once the delegate is no longer used by TFLite.
// When `options` is `nullptr`, the default option values (see
// `TfLiteDummyDelegateOptionsDefault`) are used.
TfLiteDelegate* TfLiteDummyDelegateCreate(const DummyDelegateOptions* options);

// Destroys a delegate that was created by a `TfLiteDummyDelegateCreate` call.
void TfLiteDummyDelegateDelete(TfLiteDelegate* delegate);

如要整合「DummyDelegate」與基準工具和推論工具，請定義委派供應商 (DelegateProvider)，如下所示：

// Delegate provider that lets TFLite tooling create the DummyDelegate.
// It is controlled by a single boolean tool parameter, "use_dummy_delegate".
class DummyDelegateProvider : public DelegateProvider {
 public:
  // Registers the "use_dummy_delegate" parameter with a default of false, so
  // the dummy delegate is not used unless it is explicitly requested.
  DummyDelegateProvider() {
    default_params_.AddParam("use_dummy_delegate",
                             ToolParam::Create<bool>(false));
  }

  // Returns the flags this provider understands, bound to `params`.
  std::vector<Flag> CreateFlags(ToolParams* params) const final;

  // Logs the current value of this provider's parameter(s).
  void LogParams(const ToolParams& params) const final;

  // Creates the delegate described by `params`; the returned pointer holds
  // nullptr when the dummy delegate was not requested.
  TfLiteDelegatePtr CreateTfLiteDelegate(const ToolParams& params) const final;

  // Name identifying this provider.
  std::string GetName() const final { return "DummyDelegate"; }
};
// NOTE(review): presumably adds this provider to the delegate registrar's
// provider list at static-initialization time -- confirm against the macro
// definition.
REGISTER_DELEGATE_PROVIDER(DummyDelegateProvider);

// Returns the single boolean flag named "use_dummy_delegate", bound to the
// parameter of the same name in `params`.
std::vector<Flag> DummyDelegateProvider::CreateFlags(ToolParams* params) const {
  return {CreateFlag<bool>("use_dummy_delegate", params,
                           "use the dummy delegate.")};
}

// Writes the value of the "use_dummy_delegate" parameter to the INFO log.
void DummyDelegateProvider::LogParams(const ToolParams& params) const {
  const bool use_dummy_delegate = params.Get<bool>("use_dummy_delegate");
  TFLITE_LOG(INFO) << "Use dummy test delegate : [" << use_dummy_delegate
                   << "]";
}

// Creates the dummy delegate with default options when the
// "use_dummy_delegate" parameter is true; otherwise returns a pointer that
// holds nullptr together with a no-op deleter.
TfLiteDelegatePtr DummyDelegateProvider::CreateTfLiteDelegate(
    const ToolParams& params) const {
  const bool dummy_requested = params.Get<bool>("use_dummy_delegate");
  if (!dummy_requested) {
    return TfLiteDelegatePtr(nullptr, [](TfLiteDelegate*) {});
  }
  auto options = TfLiteDummyDelegateOptionsDefault();
  return TfLiteDummyDelegateCreateUnique(&options);
}

BUILD 規則定義很重要，因為您必須確保永遠連結程式庫，且不會由最佳化工具捨棄。

#### The following are for using the dummy test delegate in TFLite tooling ####
# Library built from dummy_delegate_provider.cc. It depends on the dummy
# delegate itself and on the delegate-provider header target. Tool binaries
# obtain the provider by listing this target in their deps.
cc_library(
    name = "dummy_delegate_provider",
    srcs = ["dummy_delegate_provider.cc"],
    copts = tflite_copts(),
    deps = [
        ":dummy_delegate",
        "//tensorflow/lite/tools/delegates:delegate_provider_hdr",
    ],
    alwayslink = 1, # This is required so the optimizer doesn't optimize the library away.
)

接著在 BUILD 檔案中新增下列包裝函式規則，以建立可搭配您自己的委派執行的基準測試工具、推論工具和其他評估工具。

# Benchmark tool binary: the stock benchmark_model_main target linked together
# with the dummy delegate provider.
cc_binary(
    name = "benchmark_model_plus_dummy_delegate",
    copts = tflite_copts(),
    linkopts = task_linkopts(),
    deps = [
        ":dummy_delegate_provider",
        "//tensorflow/lite/tools/benchmark:benchmark_model_main",
    ],
)

# Inference-diff evaluation binary: the task executor main plus the
# inference_diff evaluation library, linked with the dummy delegate provider.
cc_binary(
    name = "inference_diff_plus_dummy_delegate",
    copts = tflite_copts(),
    linkopts = task_linkopts(),
    deps = [
        ":dummy_delegate_provider",
        "//tensorflow/lite/tools/evaluation/tasks:task_executor_main",
        "//tensorflow/lite/tools/evaluation/tasks/inference_diff:run_eval_lib",
    ],
)

# ImageNet image-classification evaluation binary: the task executor main plus
# the imagenet_image_classification evaluation library, linked with the dummy
# delegate provider.
cc_binary(
    name = "imagenet_classification_eval_plus_dummy_delegate",
    copts = tflite_copts(),
    linkopts = task_linkopts(),
    deps = [
        ":dummy_delegate_provider",
        "//tensorflow/lite/tools/evaluation/tasks:task_executor_main",
        "//tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification:run_eval_lib",
    ],
)

# COCO object-detection evaluation binary: the task executor main plus the
# coco_object_detection evaluation library, linked with the dummy delegate
# provider.
cc_binary(
    name = "coco_object_detection_eval_plus_dummy_delegate",
    copts = tflite_copts(),
    linkopts = task_linkopts(),
    deps = [
        ":dummy_delegate_provider",
        "//tensorflow/lite/tools/evaluation/tasks:task_executor_main",
        "//tensorflow/lite/tools/evaluation/tasks/coco_object_detection:run_eval_lib",
    ],
)

您也可以將這個委派供應商插入 TFLite 核心測試，詳情請參閱這裡的說明。

選項 2：使用外部委派代表

在這個替代方案中，您要先建立外部委派轉接器 external_delegate_adaptor.cc，如下所示。請注意，如先前所述，相較於選項 1，較不建議採用這種方法。

// Creates a DummyDelegate configured from `num_options` key/value pairs.
//
// `options_keys[i]` and `options_values[i]` are turned into a synthetic
// command-line argument "--<key>=<value>" and parsed with the regular TFLite
// command-line flag parser.
//
// Returns the new delegate, or nullptr when flag parsing fails. The delegate
// is created with `TfLiteDummyDelegateCreate`.
TfLiteDelegate* CreateDummyDelegateFromOptions(char** options_keys,
                                               char** options_values,
                                               size_t num_options) {
  DummyDelegateOptions options = TfLiteDummyDelegateOptionsDefault();

  // Parse key-values options to DummyDelegateOptions.
  // You can achieve this by mimicking them as command-line flags.
  // A vector owns the argv array, so it is released with the matching array
  // deallocation (a plain std::unique_ptr<const char*> would call scalar
  // `delete` on memory obtained from `new[]`).
  constexpr char kDummyDelegateParsing[] = "dummy_delegate_parsing";
  std::vector<const char*> argv;
  argv.reserve(num_options + 1);
  // argv[0] stands in for the program name.
  argv.push_back(kDummyDelegateParsing);

  std::vector<std::string> option_args;
  // Reserving up front guarantees the strings are never relocated, which
  // keeps the c_str() pointers stored in `argv` valid.
  option_args.reserve(num_options);
  for (size_t i = 0; i < num_options; ++i) {
    option_args.emplace_back("--");
    std::string& arg = option_args.back();
    arg.append(options_keys[i]);
    arg.push_back('=');
    arg.append(options_values[i]);
    argv.push_back(arg.c_str());
  }

  // Define command-line flags.
  // ...
  std::vector<tflite::Flag> flag_list = {
      tflite::Flag::CreateFlag(...),
      ...,
      tflite::Flag::CreateFlag(...),
  };

  int argc = static_cast<int>(argv.size());
  if (!tflite::Flags::Parse(&argc, argv.data(), flag_list)) {
    return nullptr;
  }

  return TfLiteDummyDelegateCreate(&options);
}

// The entry points below get C linkage when this file is compiled as C++.
#ifdef __cplusplus
extern "C" {
#endif  // __cplusplus

// Defines two symbols that need to be exported to use the TFLite external
// delegate. See tensorflow/lite/delegates/external for details.

// Creates the delegate from `num_options` key/value option pairs by forwarding
// to CreateDummyDelegateFromOptions. `report_error` is accepted as part of the
// signature but is not used by this implementation.
TFL_CAPI_EXPORT TfLiteDelegate* tflite_plugin_create_delegate(
    char** options_keys, char** options_values, size_t num_options,
    void (*report_error)(const char*)) {
  return tflite::tools::CreateDummyDelegateFromOptions(
      options_keys, options_values, num_options);
}

// Destroys `delegate` by forwarding to TfLiteDummyDelegateDelete.
TFL_CAPI_EXPORT void tflite_plugin_destroy_delegate(TfLiteDelegate* delegate) {
  TfLiteDummyDelegateDelete(delegate);
}

#ifdef __cplusplus
}
#endif  // __cplusplus

現在請建立對應的 BUILD 目標以建構動態程式庫，如下所示：

# Builds external_delegate_adaptor.cc into dummy_external_delegate.so.
# linkshared = 1 makes this cc_binary produce a shared library rather than an
# executable.
cc_binary(
    name = "dummy_external_delegate.so",
    srcs = [
        "external_delegate_adaptor.cc",
    ],
    linkshared = 1,
    linkstatic = 1,
    deps = [
        ":dummy_delegate",
        "//tensorflow/lite/c:common",
        "//tensorflow/lite/tools:command_line_flags",
        "//tensorflow/lite/tools:logging",
    ],
)

建立這個外部委派 .so 檔案後，只要二進位檔已與支援指令列旗標 (請參閱這篇文章) 的 external_delegate_provider 程式庫連結，您就可以建構二進位檔或使用預先建構的二進位檔來搭配新的委派執行。注意：這個外部委派供應商已連結至現有的測試和工具二進位檔。

請參閱這裡的說明，瞭解如何透過這個外部委派方法對虛擬委派 (dummy delegate) 進行基準測試。您也可以對先前提到的測試和評估工具使用類似的指令。

值得注意的是，外部委派是 LiteRT Python 繫結中委派的對應 C++ 實作，如這篇文章所示。因此，在這裡建立的動態外部委派轉接器程式庫應可直接與 LiteRT Python API 搭配使用。

資源

每夜預先建構的 TFLite 工具二進位檔下載連結

OS	ARCH	BINARY_NAME
Linux	x86_64	benchmark_model inference_diff imagenet_image_classification_eval coco_object_detection_eval
	arm	benchmark_model inference_diff imagenet_image_classification_eval coco_object_detection_eval
	aarch64	benchmark_model inference_diff imagenet_image_classification_eval coco_object_detection_eval
Android	arm	benchmark_model benchmark_model.apk inference_diff imagenet_image_classification_eval coco_object_detection_eval
Android	aarch64	benchmark_model benchmark_model.apk inference_diff imagenet_image_classification_eval coco_object_detection_eval