ขอแนะนำ Google AI Edge Portal: เปรียบเทียบประสิทธิภาพ AI บนอุปกรณ์ Edge ในวงกว้าง ลงชื่อสมัครใช้เพื่อขอสิทธิ์เข้าถึงในช่วงเวอร์ชันตัวอย่างก่อนเปิดตัว

Google uses AI technology to translate content into your preferred language. AI translations can contain errors.

LiteRT-LM Cross-Platform C++ API

Conversation เป็น API ระดับสูงที่แสดงถึงการสนทนาแบบเก็บสถานะ (stateful) หนึ่งรายการกับ LLM และเป็นจุดเริ่มต้นที่แนะนำสำหรับผู้ใช้ส่วนใหญ่ โดยจะจัดการ Session ภายในและจัดการงานประมวลผลข้อมูลที่ซับซ้อนให้ งานเหล่านี้รวมถึงการรักษาบริบทเริ่มต้น การจัดการคำจำกัดความของเครื่องมือ การประมวลผลล่วงหน้าของข้อมูลหลายรูปแบบ และการใช้เทมเพลตพรอมต์ Jinja พร้อมการจัดรูปแบบข้อความตามบทบาท

เวิร์กโฟลว์ของ Conversation API

วงจรการใช้งาน Conversation API โดยทั่วไปมีดังนี้

สร้าง Engine: เริ่มต้น Engine เพียงรายการเดียวด้วยเส้นทางและค่ากำหนดของโมเดล ออบเจ็กต์นี้มีขนาดใหญ่ (heavyweight) เนื่องจากเป็นที่เก็บน้ำหนัก (weights) ของโมเดล
สร้าง Conversation: ใช้ Engine เพื่อสร้างออบเจ็กต์ Conversation แบบเบา (lightweight) อย่างน้อย 1 รายการ
ส่งข้อความ: ใช้เมธอดของออบเจ็กต์ Conversation เพื่อส่งข้อความไปยัง LLM และรับคำตอบ ซึ่งจะช่วยให้ โต้ตอบได้เหมือนแชท

ด้านล่างนี้เป็นวิธีที่ง่ายที่สุดในการส่งข้อความและรับคำตอบจากโมเดล ขอแนะนำให้ใช้ในกรณีการใช้งานส่วนใหญ่ ซึ่งจะคล้ายกับ Gemini Chat API

SendMessage: การเรียกแบบบล็อกซึ่งรับอินพุตจากผู้ใช้และส่งคืนคำตอบที่สมบูรณ์ของโมเดล
SendMessageAsync: การเรียกแบบไม่บล็อกซึ่งสตรีมคำตอบของโมเดลกลับมาทีละโทเค็นผ่านฟังก์ชันเรียกกลับ (callback)

ตัวอย่างข้อมูลโค้ดมีดังนี้

เนื้อหาที่เป็นข้อความเท่านั้น

#include "runtime/engine/engine.h"

// ...

// 1. Define model assets and engine settings.
// `model_assets` is status-checked, so it has to be dereferenced before it is
// handed to EngineSettings::CreateDefault().
auto model_assets = ModelAssets::Create(model_path);
CHECK_OK(model_assets);

auto engine_settings = EngineSettings::CreateDefault(
    *model_assets,
    /*backend=*/litert::lm::Backend::CPU);
CHECK_OK(engine_settings);

// 2. Create the main Engine object.
absl::StatusOr<std::unique_ptr<Engine>> engine =
    Engine::CreateEngine(*engine_settings);
CHECK_OK(engine);

// 3. Create a Conversation.
// `**engine` unwraps the StatusOr first and the unique_ptr second.
auto conversation_config = ConversationConfig::CreateDefault(**engine);
CHECK_OK(conversation_config);
absl::StatusOr<std::unique_ptr<Conversation>> conversation =
    Conversation::Create(**engine, *conversation_config);
CHECK_OK(conversation);

// 4. Send message to the LLM with blocking call.
absl::StatusOr<Message> model_message = (*conversation)->SendMessage(
    JsonMessage{
        {"role", "user"},
        {"content", "What is the tallest building in the world?"}
    });
CHECK_OK(model_message);

// 5. Print the model message.
std::cout << *model_message << std::endl;

// 6. Send message to the LLM with asynchronous call
// where CreatePrintMessageCallback is a user-implemented callback that
// processes the message once a chunk of message output is received.
// The callback holds a reference to `captured_output`, so the stream must stay
// alive until the asynchronous call has finished.
std::stringstream captured_output;
(*conversation)->SendMessageAsync(
    JsonMessage{
        {"role", "user"},
        {"content", "What is the tallest building in the world?"}
    },
    CreatePrintMessageCallback(captured_output));
// Wait until asynchronous finish or timeout.
// The StatusOr has to be unwrapped before `->` reaches the Engine itself.
CHECK_OK((*engine)->WaitUntilDone(absl::Seconds(10)));

ตัวอย่าง CreatePrintMessageCallback

// Returns a callback suitable for SendMessageAsync() that prints every
// streamed chunk to stdout and mirrors the text into `captured_output`.
//
// `captured_output` is captured by reference, so it must outlive the returned
// callback.
absl::AnyInvocable<void(absl::StatusOr<Message>)> CreatePrintMessageCallback(
    std::stringstream& captured_output) {
  return [&captured_output](absl::StatusOr<Message> message) {
    // Failure: report the status text and stop.
    if (!message.ok()) {
      std::cout << message.status().message() << std::endl;
      return;
    }

    // Only the JsonMessage alternative is handled; anything else is ignored.
    auto* chunk = std::get_if<JsonMessage>(&(*message));
    if (chunk == nullptr) {
      return;
    }

    // A null JSON value only terminates the current output line.
    if (chunk->is_null()) {
      std::cout << std::endl << std::flush;
      return;
    }

    ABSL_CHECK_OK(
        PrintJsonMessage(*chunk, captured_output, /*streaming=*/true));
  };
}

// Writes the text carried by `message` to both `captured_output` and stdout.
//
// Accepted shapes of message["content"]:
//   * an array of parts: the "text" of every part whose "type" is "text" is
//     concatenated, other parts are skipped;
//   * a plain string;
//   * an object with a string "text" field.
//
// When `streaming` is false a newline is appended after the text; both streams
// are flushed in either case.
//
// Returns InvalidArgumentError (with the dumped message) when "content" is
// missing, has another shape, or a text part carries no string "text". Nothing
// is written in that case.
absl::Status PrintJsonMessage(const JsonMessage& message,
                              std::stringstream& captured_output,
                              bool streaming = false) {
  const auto invalid_message = [&message]() {
    return absl::InvalidArgumentError("Invalid message: " + message.dump());
  };

  // Every key is probed with contains() first: operator[] on a const JSON
  // value must not be used with a key that is absent or on a non-object.
  if (!message.contains("content")) {
    return invalid_message();
  }
  const auto& content = message["content"];

  // Collect the text first so that a malformed message produces no partial
  // output.
  std::string text;
  if (content.is_array()) {
    for (const auto& part : content) {
      const bool is_text_part =
          part.contains("type") && part["type"] == "text";
      if (!is_text_part) {
        continue;
      }
      if (!part.contains("text") || !part["text"].is_string()) {
        return invalid_message();
      }
      text += part["text"].get<std::string>();
    }
  } else if (content.is_string()) {
    text = content.get<std::string>();
  } else if (content.contains("text") && content["text"].is_string()) {
    text = content["text"].get<std::string>();
  } else {
    return invalid_message();
  }

  const auto emit = [&text, streaming](std::ostream& out) {
    out << text;
    if (!streaming) {
      out << '\n';
    }
    out << std::flush;
  };
  emit(captured_output);
  emit(std::cout);
  return absl::OkStatus();
}

🔴 ใหม่: การคาดการณ์หลายโทเค็น (MTP)

การคาดการณ์หลายโทเค็น (MTP) คือการเพิ่มประสิทธิภาพที่ช่วย เร่งความเร็วในการถอดรหัสได้อย่างมาก เราขอแนะนำให้ใช้ MTP สำหรับงานทั้งหมดในแบ็กเอนด์ GPU

หากต้องการใช้ MTP คุณต้องเปิดใช้การถอดรหัสแบบคาดเดา (speculative decoding) ในการตั้งค่าขั้นสูงของการกำหนดค่า Engine

// 1. Define model assets and engine settings.
// `model_assets` is status-checked, so it has to be dereferenced before it is
// handed to EngineSettings::CreateDefault().
auto model_assets = ModelAssets::Create(model_path);
CHECK_OK(model_assets);

auto engine_settings = EngineSettings::CreateDefault(
    *model_assets,
    /*backend=*/litert::lm::Backend::GPU);
CHECK_OK(engine_settings);

// 2. Enable MTP via speculative decoding in advanced settings.
litert::lm::AdvancedSettings advanced_settings;
advanced_settings.enable_speculative_decoding = true;
engine_settings->GetMutableMainExecutorSettings().SetAdvancedSettings(
    advanced_settings);

// 3. Create the main Engine object.
absl::StatusOr<std::unique_ptr<Engine>> engine = Engine::CreateEngine(
    *engine_settings);
CHECK_OK(engine);

// The same steps to create Conversation and send messages as above...

เนื้อหาข้อมูลหลายรูปแบบ

// To use multimodality, the engine must be created with vision and audio
// backend depending on the multimodality to be used
auto engine_settings = EngineSettings::CreateDefault(
    model_assets,
    /*backend=*/litert::lm::Backend::CPU,
    /*vision_backend*/litert::lm::Backend::GPU,
    /*audio_backend*/litert::lm::Backend::CPU,
);

// The same steps to create Engine and Conversation as above...

// Send message to the LLM with image data.
absl::StatusOr<Message> model_message = (*conversation)->SendMessage(
    JsonMessage{
        {"role", "user"},
        {"content", { // Now content must be an array.
          {
            {"type", "text"}, {"text", "Describe the following image: "}
          },
          {
            {"type", "image"}, {"path", "/file/path/to/image.jpg"}
          }
        }},
    });
CHECK_OK(model_message);

// Print the model message.
std::cout << *model_message << std::endl;

// Send message to the LLM with audio data.
model_message = (*conversation)->SendMessage(
    JsonMessage{
        {"role", "user"},
        {"content", { // Now content must be an array.
          {
            {"type", "text"}, {"text", "Transcribe the audio: "}
          },
          {
            {"type", "audio"}, {"path", "/file/path/to/audio.wav"}
          }
        }},
    });
CHECK_OK(model_message);

// Print the model message.
std::cout << *model_message << std::endl;

// The content can include multiple image or audio data.
model_message = (*conversation)->SendMessage(
    JsonMessage{
        {"role", "user"},
        {"content", { // Now content must be an array.
          {
            {"type", "text"}, {"text", "First briefly describe the two images "}
          },
          {
            {"type", "image"}, {"path", "/file/path/to/image1.jpg"}
          },
          {
            {"type", "text"}, {"text", "and "}
          },
          {
            {"type", "image"}, {"path", "/file/path/to/image2.jpg"}
          },
          {
            {"type", "text"}, {"text", " then transcribe the content in the audio"}
          },
          {
            {"type", "audio"}, {"path", "/file/path/to/audio.wav"}
          }
        }},
    });
CHECK_OK(model_message);

// Print the model message.
std::cout << *model_message << std::endl;

ใช้การสนทนากับเครื่องมือ

ดูรายละเอียดการใช้งานเครื่องมือด้วย Conversation API ได้ที่การใช้งานขั้นสูง

คอมโพเนนต์ในการสนทนา

Conversation อาจถือเป็นตัวแทน (delegate) ของผู้ใช้ในการดูแล Session และประมวลผลข้อมูลที่ซับซ้อนก่อนส่งข้อมูลไปยัง Session

ประเภท I/O

รูปแบบอินพุตและเอาต์พุตหลักสำหรับ Conversation API คือ Message ปัจจุบันฟีเจอร์นี้ได้รับการติดตั้งใช้งานเป็น JsonMessage ซึ่งเป็นนามแฝงของประเภทสำหรับ ordered_json ซึ่งเป็นโครงสร้างข้อมูลแบบคีย์-ค่าที่ซ้อนกันแบบยืดหยุ่น

API ของ Conversation จะทำงานแบบข้อความเข้า-ข้อความออก ซึ่งจำลองประสบการณ์การแชททั่วไป ความยืดหยุ่นของ Message ช่วยให้ผู้ใช้ใส่ฟิลด์ใดก็ได้ตามที่เทมเพลตพรอมต์หรือโมเดล LLM ที่เฉพาะเจาะจงต้องการ ซึ่งช่วยให้ LiteRT-LM รองรับโมเดลได้หลากหลาย

แม้จะไม่มีมาตรฐานที่ตายตัว แต่เทมเพลตพรอมต์และโมเดลส่วนใหญ่คาดหวังให้ Message เป็นไปตามรูปแบบที่คล้ายกับที่ใช้ในเนื้อหาของ Gemini API หรือโครงสร้างข้อความของ OpenAI

Message ต้องมี role ซึ่งแสดงถึงผู้ที่ส่งข้อความ content อาจเป็นสตริงข้อความที่เรียบง่าย

{
  "role": "model", // Represent who the message is sent from.
  "content": "Hello World!" // Naive text only content.
}

สำหรับอินพุตข้อมูลหลายรูปแบบ content คือรายการของ part เช่นเดียวกับ Message นั่นคือ part ไม่ใช่โครงสร้างข้อมูลที่กำหนดไว้ล่วงหน้า แต่เป็นประเภทข้อมูลคู่คีย์-ค่าที่เรียงลำดับ ฟิลด์ที่เฉพาะเจาะจงจะขึ้นอยู่กับสิ่งที่เทมเพลตพรอมต์และโมเดลคาดหวัง

{
  "role": "user",
  "content": [  // Multimodal content.
    // Now the content is composed of parts
    {
      "type": "text",
      "text": "Describe the image in details: "
    },
    {
      "type": "image",
      "path": "/path/to/image.jpg"
    }
  ]
}

สำหรับ part แบบหลายรูปแบบ เราจะรองรับรูปแบบต่อไปนี้ที่จัดการโดย data_utils.h

{
  "type": "text",
  "text": "this is a text"
}

{
  "type": "image",
  "path": "/path/to/image.jpg"
}

{
  "type": "image",
  "blob": "base64 encoded image bytes as string"
}

{
  "type": "audio",
  "path": "/path/to/audio.wav"
}

{
  "type": "audio",
  "blob": "base64 encoded audio bytes as string"
}

Prompt Template

PromptTemplate ได้รับการติดตั้งใช้งานเป็น Wrapper แบบบางที่ครอบ Minja เพื่อให้มีความยืดหยุ่นสำหรับโมเดลรูปแบบต่างๆ Minja เป็นการใช้งาน Jinja template engine ใน C++ ซึ่งประมวลผลอินพุต JSON เพื่อสร้างพรอมต์ที่จัดรูปแบบแล้ว

เครื่องมือเทมเพลต Jinja เป็นรูปแบบที่ใช้กันอย่างแพร่หลายสำหรับเทมเพลตพรอมต์ LLM ลองดูตัวอย่างต่อไปนี้

รูปแบบเครื่องมือเทมเพลต Jinja ควรตรงกับโครงสร้าง ที่โมเดลที่ปรับแต่งตามคำสั่งคาดหวังอย่างเคร่งครัด โดยปกติแล้ว การเผยแพร่โมเดลจะมี เทมเพลต Jinja มาตรฐานเพื่อให้มั่นใจว่ามีการใช้โมเดลอย่างเหมาะสม

ไฟล์โมเดลจะให้ข้อมูลเมตาของเทมเพลต Jinja ที่โมเดลใช้

หมายเหตุ: การเปลี่ยนแปลงพรอมต์เพียงเล็กน้อยเนื่องจากการจัดรูปแบบไม่ถูกต้องอาจทำให้โมเดลเสื่อมถอยลงอย่างมาก ตามที่รายงานใน Quantifying Language Models' Sensitivity to Spurious Features in Prompt Design or: How I learned to start worrying about prompt formatting

Preface (คำนำ)

Preface กำหนดบริบทเริ่มต้นสำหรับการสนทนา ซึ่งอาจรวมถึงข้อความเริ่มต้น คำจำกัดความของเครื่องมือ และข้อมูลพื้นฐานอื่นๆ ที่ LLM ต้องใช้เพื่อเริ่มการโต้ตอบ ซึ่งจะทำให้มีฟังก์ชันการทำงานคล้ายกับ Gemini API system instruction และ Gemini API Tools

คำนำประกอบด้วยช่องต่อไปนี้

messages ข้อความในคำนำ ข้อความดังกล่าวเป็นพื้นฐานเริ่มต้น ของการสนทนา เช่น ข้อความอาจเป็น ประวัติการสนทนา คำสั่งของระบบวิศวกรรมพรอมต์ ตัวอย่างแบบ Few-Shot เป็นต้น
tools เครื่องมือที่โมเดลใช้ในการสนทนาได้ รูปแบบของเครื่องมือ ก็ไม่ได้กำหนดไว้เช่นกัน แต่ส่วนใหญ่จะใช้รูปแบบเดียวกับ Gemini API FunctionDeclaration
extra_context บริบทเพิ่มเติมที่ช่วยให้โมเดลขยายความสามารถเพื่อ ปรับแต่งข้อมูลบริบทที่จำเป็นในการเริ่มการสนทนา ตัวอย่างเช่น
- enable_thinking สำหรับโมเดลที่มีโหมดการคิด เช่น Qwen3 หรือ SmolLM3-3B

ตัวอย่างคำนำที่ระบุคำสั่งเริ่มต้นของระบบ เครื่องมือ และปิดใช้โหมดการคิด

// Example Preface: one system message, two tool declarations, and thinking
// mode switched off through extra_context.
Preface preface = JsonPreface({
  // `messages` is a list of messages: the extra pair of braces makes this a
  // JSON array holding a single {role, content} object.
  .messages = {
    {
      {"role", "system"},
      {"content", "You are a model that can do function calling."}
    }
  },
  // `tools` is a list of tool declarations (name, description, parameters).
  .tools = {
    {
      {"name", "get_weather"},
      {"description", "Returns the weather for a given location."},
      {"parameters", {
        {"type", "object"},
        {"properties", {
          {"location", {
            {"type", "string"},
            {"description", "The location to get the weather for."}
          }}
        }},
        {"required", {"location"}}
      }}
    },
    {
      {"name", "get_stock_price"},
      {"description", "Returns the stock price for a given stock symbol."},
      {"parameters", {
        {"type", "object"},
        {"properties", {
          {"stock_symbol", {
            {"type", "string"},
            {"description", "The stock symbol to get the price for."}
          }}
        }},
        {"required", {"stock_symbol"}}
      }}
    }
  },
  // Key/value pairs are written as {"key", value} in a C++ braced
  // initializer; the JSON-style "key": value form is not valid C++.
  .extra_context = {
    {"enable_thinking", false}
  }
});

ประวัติ

การสนทนาจะเก็บรายการข้อความ ทั้งหมดที่แลกเปลี่ยนภายในเซสชัน ประวัติการสนทนานี้มีความสําคัญอย่างยิ่งต่อการแสดงผลเทมเพลตพรอมต์ เนื่องจากโดยปกติแล้วเทมเพลตพรอมต์ Jinja จะต้องใช้ประวัติการสนทนาทั้งหมด เพื่อสร้างพรอมต์ที่ถูกต้องสําหรับ LLM

อย่างไรก็ตาม Session ของ LiteRT-LM เป็นแบบเก็บสถานะ ซึ่งหมายความว่าระบบจะประมวลผลอินพุตแบบเพิ่มทีละส่วน เพื่อลดช่องว่างนี้ Conversation จะสร้างพรอมต์ส่วนเพิ่มที่จำเป็นโดยการแสดงผลเทมเพลตพรอมต์ 2 ครั้ง ได้แก่ ครั้งแรกด้วยประวัติจนถึงรอบก่อนหน้า และครั้งที่สองโดยรวมข้อความปัจจุบันด้วย จากนั้นจึงเปรียบเทียบพรอมต์ที่แสดงผลทั้ง 2 รายการเพื่อดึงเฉพาะส่วนใหม่ไปส่งให้ Session

ConversationConfig

ConversationConfig ใช้เพื่อเริ่มต้นอินสแตนซ์ Conversation คุณสร้างการกำหนดค่านี้ได้ 2 วิธี ดังนี้

จาก Engine: วิธีนี้ใช้ SessionConfig เริ่มต้นที่เชื่อมโยงกับ Engine
จาก SessionConfig ที่เฉพาะเจาะจง: วิธีนี้ช่วยให้ควบคุมการตั้งค่าเซสชันได้ละเอียดมากขึ้น

นอกเหนือจากการตั้งค่าเซสชันแล้ว คุณยังปรับแต่งลักษณะการทำงานของ Conversation เพิ่มเติมได้ภายใน ConversationConfig ซึ่งรวมถึงเนื้อหาต่อไปนี้

การระบุPreface
เขียนทับ PromptTemplate เริ่มต้น
เขียนทับ DataProcessorConfig เริ่มต้น

การเขียนทับเหล่านี้มีประโยชน์อย่างยิ่งสำหรับโมเดลที่ปรับแต่งอย่างละเอียด ซึ่งอาจต้องมีการกำหนดค่าหรือเทมเพลตพรอมต์ที่แตกต่างจากโมเดลพื้นฐานที่ได้มา

MessageCallback

MessageCallback คือฟังก์ชัน Callback ที่ผู้ใช้ควรใช้เมื่อใช้เมธอด SendMessageAsync แบบอะซิงโครนัส

ลายเซ็นการเรียกกลับคือ absl::AnyInvocable<void(absl::StatusOr<Message>)> ระบบจะเรียกใช้ฟังก์ชันนี้ภายใต้เงื่อนไขต่อไปนี้

เมื่อได้รับMessageก้อนใหม่จากโมเดล
หากเกิดข้อผิดพลาดระหว่างการประมวลผลข้อความของ LiteRT-LM
เมื่อการอนุมานของ LLM เสร็จสมบูรณ์ ระบบจะทริกเกอร์การเรียกกลับด้วย Message ที่ว่างเปล่า (เช่น JsonMessage()) เพื่อส่งสัญญาณว่าสิ้นสุด การตอบกลับแล้ว

ดูตัวอย่างการใช้งานได้ที่การเรียกแบบอะซิงโครนัสในขั้นตอนที่ 6

หมายเหตุ: Message ที่ได้รับจากฟังก์ชันเรียกกลับจะมีเฉพาะเอาต์พุตของโมเดลส่วนล่าสุด ไม่ใช่ประวัติข้อความทั้งหมด

เช่น หากคำตอบที่สมบูรณ์ของโมเดลซึ่งคาดว่าจะได้จากการเรียก SendMessage แบบบล็อกคือ

{
  "role": "model",
  "content": [
    {
      "type": "text",
      "text": "Hello World!"
    }
  ]
}

ระบบอาจเรียกใช้การเรียกกลับใน SendMessageAsync หลายครั้ง โดยแต่ละครั้งจะมีข้อความส่วนถัดไป

// 1st Message
{
  "role": "model",
  "content": [
    {
      "type": "text",
      "text": "He"
    }
  ]
}

// 2nd Message
{
  "role": "model",
  "content": [
    {
      "type": "text",
      "text": "llo"
    }
  ]
}

// 3rd Message
{
  "role": "model",
  "content": [
    {
      "type": "text",
      "text": " Wo"
    }
  ]
}

// 4th Message
{
  "role": "model",
  "content": [
    {
      "type": "text",
      "text": "rl"
    }
  ]
}

// 5th Message
{
  "role": "model",
  "content": [
    {
      "type": "text",
      "text": "d!"
    }
  ]
}

ผู้ใช้มีหน้าที่รวบรวมก้อนข้อมูลเหล่านี้หากจำเป็นต้องมีการตอบกลับที่สมบูรณ์ระหว่างสตรีมแบบไม่พร้อมกัน หรือคุณจะดูคำตอบแบบเต็มได้ในรายการสุดท้ายของ History เมื่อการเรียกแบบอะซิงโครนัสเสร็จสมบูรณ์

การใช้งานขั้นสูง

การถอดรหัสแบบจำกัด

LiteRT-LM รองรับการถอดรหัสแบบจำกัด ซึ่งช่วยให้คุณบังคับใช้โครงสร้างที่เฉพาะเจาะจงกับเอาต์พุตของโมเดลได้ เช่น สคีมา JSON, รูปแบบ Regex หรือกฎไวยากรณ์

หากต้องการเปิดใช้ ให้ตั้งค่า EnableConstrainedDecoding(true) ใน ConversationConfig และ ระบุ ConstraintProviderConfig (เช่น LlGuidanceConfig สำหรับ การรองรับนิพจน์ทั่วไป/JSON/ไวยากรณ์) จากนั้นส่งข้อจำกัดผ่าน OptionalArgs ใน SendMessage

ตัวอย่าง: ข้อจำกัดนิพจน์ทั่วไป

// Describe the constraint: the type selects regex matching and the string
// holds the pattern itself.
LlGuidanceConstraintArg constraint_arg;
constraint_arg.constraint_type = LlgConstraintType::kRegex;
constraint_arg.constraint_string = "a+b+"; // Force output to match this regex

// The constraint is passed as the second argument of SendMessage() through the
// `decoding_constraint` field of a braced (designated) initializer.
// NOTE(review): `conversation` is used with `->` directly here, so it is
// presumably an already-unwrapped Conversation pointer rather than the
// StatusOr used in the earlier snippets; `response` is not status-checked in
// this snippet. Confirm both against the calling code.
auto response = conversation->SendMessage(
    user_message,
    {.decoding_constraint = constraint_arg}
);

ดูรายละเอียดทั้งหมด รวมถึงสคีมา JSON และการรองรับไวยากรณ์ Lark ได้ที่เอกสารประกอบการถอดรหัสแบบจำกัด

การใช้เครื่องมือ

การเรียกใช้เครื่องมือช่วยให้ LLM ขอให้เรียกใช้ฟังก์ชันฝั่งไคลเอ็นต์ได้ คุณกำหนดเครื่องมือใน Preface ของการสนทนาโดยอ้างอิงตามชื่อเครื่องมือ เมื่อโมเดลส่งออกการเรียกใช้เครื่องมือ คุณจะตรวจจับการเรียกนั้น เรียกใช้ฟังก์ชันที่เกี่ยวข้องในแอปพลิเคชัน แล้วส่งผลลัพธ์กลับไปยังโมเดล

ขั้นตอนการทำงานระดับสูง:

ประกาศเครื่องมือ: กำหนดเครื่องมือ (ชื่อ คำอธิบาย พารามิเตอร์) ใน Preface JSON
ตรวจหาการเรียกใช้เครื่องมือ: ตรวจสอบ model_message["tool_calls"] ในการตอบกลับ
ดำเนินการ: เรียกใช้ตรรกะของแอปพลิเคชันสำหรับเครื่องมือที่ขอ
ตอบกลับ: ส่งข้อความพร้อม role: "tool" ที่มีเอาต์พุตของเครื่องมือ กลับไปยังโมเดล

ดูรายละเอียดทั้งหมดและตัวอย่างลูปแชทที่สมบูรณ์ได้ในเอกสารประกอบเกี่ยวกับการใช้เครื่องมือ