Gemini Deep Research พร้อมให้บริการในเวอร์ชันพรีวิวแล้วตอนนี้ โดยมีฟีเจอร์การวางแผนร่วมกัน การแสดงภาพข้อมูล การรองรับ MCP และอื่นๆ

Google uses AI technology to translate content into your preferred language. AI translations can contain errors.

การอนุมาน Flex

Gemini Flex API เป็นระดับการอนุมานที่ช่วยลดต้นทุนได้ 50% เมื่อเทียบกับอัตรามาตรฐาน แลกกับเวลาในการตอบสนองที่ผันแปรและความพร้อมให้บริการแบบ Best Effort (ไม่รับประกัน) API นี้ออกแบบมาสำหรับภาระงานที่ยอมรับเวลาในการตอบสนองที่นานขึ้นได้ ซึ่งต้องมีการประมวลผลแบบซิงโครนัส แต่ไม่จำเป็นต้องมีประสิทธิภาพแบบเรียลไทม์เหมือน API มาตรฐาน

วิธีใช้ Flex

หากต้องการใช้ระดับ Flex ให้ระบุ service_tier เป็น flex ในเนื้อหาของคำขอ โดยค่าเริ่มต้น คำขอจะใช้ระดับมาตรฐานหากละเว้นช่องนี้

Python

from google import genai

# Client constructed with no arguments.
client = genai.Client()

# Request parameters; "service_tier": "flex" selects the Flex tier
# (the standard tier is used when the field is omitted).
flex_request = dict(
    model="gemini-3-flash-preview",
    contents="Analyze this dataset for trends...",
    config={"service_tier": "flex"},
)

try:
    response = client.models.generate_content(**flex_request)
    print(response.text)
except Exception as exc:
    # Report the failure instead of letting the script crash.
    print(f"Flex request failed: {exc}")

JavaScript

import {GoogleGenAI} from '@google/genai';

const ai = new GoogleGenAI({});

// Sends one Flex-tier request and prints the response text, or a failure
// message if the call throws.
async function main() {
  // serviceTier "flex" selects the Flex tier for this request.
  const request = {
    model: "gemini-3-flash-preview",
    contents: "Analyze this dataset for trends...",
    config: { serviceTier: "flex" },
  };

  try {
    const result = await ai.models.generateContent(request);
    console.log(result.text);
  } catch (err) {
    console.log(`Flex request failed: ${err}`);
  }
}

await main();

Go

package main

import (
    "context"
    "fmt"
    "log"
    "google.golang.org/genai"
)

// main sends a single Flex-tier request and prints the response text.
// A client construction error is fatal; a request error is logged.
func main() {
    ctx := context.Background()

    client, err := genai.NewClient(ctx, nil)
    if err != nil {
        log.Fatal(err)
    }

    // ServiceTier "flex" selects the Flex tier for this request.
    flexConfig := &genai.GenerateContentConfig{ServiceTier: "flex"}
    prompt := genai.Text("Analyze this dataset for trends...")

    result, err := client.Models.GenerateContent(ctx, "gemini-3-flash-preview", prompt, flexConfig)
    if err != nil {
        log.Printf("Flex request failed: %v", err)
        return
    }

    fmt.Println(result.Text())
}

REST

# The API key is sent in the x-goog-api-key header instead of the URL query
# string, because URLs are commonly recorded in proxy and server access logs.
curl -X POST "https://generativelanguage.googleapis.com/v1beta/models/gemini-3-flash-preview:generateContent" \
-H "x-goog-api-key: $GEMINI_API_KEY" \
-H "Content-Type: application/json" \
-d '{
  "contents": [{
    "parts":[{"text": "Summarize the latest research on quantum computing."}]
  }],
  "service_tier": "flex"
}'

วิธีการทำงานของการอนุมาน Flex

การอนุมาน Gemini Flex ช่วยเชื่อมช่องว่างระหว่าง API มาตรฐานกับ Batch API ที่ใช้เวลาดำเนินการสูงสุด 24 ชั่วโมง โดยใช้กำลังการประมวลผลที่ "ปลดออกได้" (sheddable) ในช่วงที่ไม่ใช่ช่วงเวลาที่มีการใช้งานสูงสุด เพื่อมอบโซลูชันที่คุ้มค่าสำหรับงานเบื้องหลังและเวิร์กโฟลว์แบบลำดับ

ฟีเจอร์	Flex	Priority	มาตรฐาน	Batch
การกำหนดราคา	ส่วนลด 50%	สูงกว่ามาตรฐาน 75-100%	ราคาเต็ม	ส่วนลด 50%
เวลาในการตอบสนอง	นาที (เป้าหมาย 1-15 นาที)	ต่ำ (วินาที)	วินาทีถึงนาที	สูงสุด 24 ชั่วโมง
ความเชื่อถือได้	Best Effort (ปลดออกได้)	สูง (ปลดออกไม่ได้)	สูง / สูงปานกลาง	สูง (สำหรับปริมาณงาน)
อินเทอร์เฟซ	แบบซิงโครนัส	แบบซิงโครนัส	แบบซิงโครนัส	แบบอะซิงโครนัส

สิทธิประโยชน์ที่สำคัญ

ความคุ้มค่า: ประหยัดค่าใช้จ่ายได้อย่างมากสำหรับการประเมินที่ไม่ใช่การใช้งานจริง, เอเจนต์เบื้องหลัง และการเพิ่มคุณค่าของข้อมูล
ความยุ่งยากต่ำ: ไม่จำเป็นต้องจัดการออบเจ็กต์แบบกลุ่ม, รหัสงาน หรือการโพล เพียงเพิ่มพารามิเตอร์เดียวลงในคำขอที่มีอยู่
เวิร์กโฟลว์แบบซิงโครนัส: เหมาะอย่างยิ่งสำหรับเชน API แบบลำดับที่คำขอถัดไปขึ้นอยู่กับเอาต์พุตของคำขอก่อนหน้า ซึ่งทำให้มีความยืดหยุ่นมากกว่า Batch สำหรับเวิร์กโฟลว์แบบเอเจนต์

กรณีการใช้งาน

การประเมินแบบออฟไลน์: การเรียกใช้การทดสอบการถดถอยหรือลีดเดอร์บอร์ด "LLM-as-a-judge"
เอเจนต์เบื้องหลัง: งานแบบลำดับ เช่น การอัปเดต CRM, การสร้างโปรไฟล์ หรือการกลั่นกรองเนื้อหาที่ยอมรับความล่าช้าได้เป็นนาที
การวิจัยที่ถูกจำกัดด้วยงบประมาณ: การทดลองทางวิชาการที่ต้องใช้โทเค็นจำนวนมากโดยมีงบประมาณจำกัด

ขีดจำกัดอัตรา

ทราฟฟิกการอนุมาน Flex จะนับรวมในขีดจำกัดอัตราทั่วไป และไม่มีขีดจำกัดอัตราที่ขยายเพิ่มเหมือน Batch API

ความจุที่ปลดออกได้ (Sheddable capacity)

ระบบจะถือว่าทราฟฟิก Flex มีลำดับความสำคัญต่ำกว่า หากทราฟฟิกระดับมาตรฐานเพิ่มขึ้นอย่างรวดเร็ว ระบบอาจขัดจังหวะหรือยกเลิกคำขอ Flex เพื่อให้มีความจุเพียงพอสำหรับผู้ใช้ที่มีลำดับความสำคัญสูง หากต้องการการอนุมานที่มีลำดับความสำคัญสูง ให้ดู การอนุมานที่มีลำดับความสำคัญ

รหัสข้อผิดพลาด

เมื่อความจุของ Flex ไม่พร้อมใช้งานหรือระบบมีการใช้งานหนาแน่น API จะแสดงรหัสข้อผิดพลาดมาตรฐานดังนี้

503 Service Unavailable (ไม่พร้อมให้บริการ): ระบบมีการใช้งานเต็มความจุในขณะนี้
429 Too Many Requests (มีคำขอมากเกินไป): ถึงขีดจำกัดอัตราหรือทรัพยากรหมด

ความรับผิดชอบของไคลเอ็นต์

ไม่มีการสลับไปใช้ระดับสำรอง (fallback) ฝั่งเซิร์ฟเวอร์: เพื่อป้องกันค่าใช้จ่ายที่ไม่คาดคิด ระบบจะไม่ยกระดับคำขอ Flex เป็นระดับมาตรฐานโดยอัตโนมัติหากความจุของ Flex เต็ม
การลองซ้ำ: คุณต้องใช้ตรรกะการลองซ้ำฝั่งไคลเอ็นต์ของคุณเองด้วย Exponential Backoff
ระยะหมดเวลา: เนื่องจากคำขอ Flex อาจอยู่ในคิว เราจึงแนะนำให้เพิ่มระยะหมดเวลาฝั่งไคลเอ็นต์เป็น 10 นาทีขึ้นไปเพื่อหลีกเลี่ยงการปิดการเชื่อมต่อก่อนเวลา

ปรับระยะหมดเวลา

คุณสามารถกำหนดค่าระยะหมดเวลาต่อคำขอสำหรับ REST API และไลบรารีของไคลเอ็นต์ รวมถึงระยะหมดเวลาส่วนกลางเมื่อใช้ไลบรารีของไคลเอ็นต์เท่านั้น

ตรวจสอบเสมอว่าระยะหมดเวลาฝั่งไคลเอ็นต์ครอบคลุมกรอบเวลาที่เซิร์ฟเวอร์รอได้ (เช่น 600 วินาทีขึ้นไปสำหรับคิวรอ Flex) SDK คาดหวังค่าระยะหมดเวลาเป็นมิลลิวินาที

ระยะหมดเวลาต่อคำขอ

Python

from google import genai

client = genai.Client()

# Non-streaming request with a per-request timeout.
# http_options timeouts are given in milliseconds (900000 ms = 15 minutes).
try:
    response = client.models.generate_content(
        model="gemini-3-flash-preview",
        contents="why is the sky blue?",
        config={
            "service_tier": "flex",
            "http_options": {"timeout": 900000},
        },
    )
except Exception as e:
    print(f"Flex request failed: {e}")

# Example with streaming
try:
    response = client.models.generate_content_stream(
        model="gemini-3-flash-preview",
        contents=["List 5 ideas for a sci-fi movie."],
        config={
            "service_tier": "flex",
            # Per-request timeout for the streaming operation (60000 ms = 60 s).
            # NOTE(review): this is shorter than the 600 s or more this page
            # recommends for Flex queueing; it only illustrates the mechanism.
            "http_options": {"timeout": 60000},
        },
    )
    for chunk in response:
        # chunk.text can be None when a chunk carries no text parts; skip
        # those so the literal "None" is never written to the output.
        text = chunk.text
        if text:
            print(text, end="")

except Exception as e:
    print(f"An error occurred during streaming: {e}")

JavaScript

 import {GoogleGenAI} from '@google/genai';

 const client = new GoogleGenAI({});

 // Demonstrates per-request timeouts (httpOptions.timeout, in milliseconds)
 // for a regular call and for a streaming call on the Flex tier.
 async function main() {
     try {
         const response = await client.models.generateContent({
             model: "gemini-3-flash-preview",
             contents: "why is the sky blue?",
             config: {
               serviceTier: "flex",
               httpOptions: {timeout: 900000}
             },
         });
     } catch (e) {
         console.log(`Flex request failed: ${e}`);
     }

     // Example with streaming
     try {
         const stream = await client.models.generateContentStream({
             model: "gemini-3-flash-preview",
             contents: ["List 5 ideas for a sci-fi movie."],
             config: {
                 serviceTier: "flex",
                 httpOptions: {timeout: 60000}
             },
         });
         // generateContentStream resolves to an async iterable of chunks, so
         // it is iterated directly (there is no `.stream` member). `text` is
         // a property, not a method, and may be undefined for a chunk
         // without text parts.
         for await (const chunk of stream) {
             process.stdout.write(chunk.text ?? "");
         }
     } catch (e) {
         console.log(`An error occurred during streaming: ${e}`);
     }
 }

 await main();

Go

package main

import (
    "context"
    "fmt"
    "log"
    "time"

    "google.golang.org/api/iterator"
    "google.golang.org/genai"
)

func main() {
    ctx := context.Background()
    client, err := genai.NewClient(ctx, nil)
    if err != nil {
        log.Fatal(err)
    }
    defer client.Close()

    timeoutCtx, cancel := context.WithTimeout(ctx, 900*time.Second)
    defer cancel()

    _, err = client.Models.GenerateContent(
        timeoutCtx,
        "gemini-3-flash-preview",
        genai.Text("why is the sky blue?"),
        &genai.GenerateContentConfig{
            ServiceTier: "flex",
        },
    )
    if err != nil {
        fmt.Printf("Flex request failed: %v\n", err)
    }

    // Example with streaming
    streamTimeoutCtx, streamCancel := context.WithTimeout(ctx, 60*time.Second)
    defer streamCancel()

    iter := client.Models.GenerateContentStream(
        streamTimeoutCtx,
        "gemini-3-flash-preview",
        genai.Text("List 5 ideas for a sci-fi movie."),
        &genai.GenerateContentConfig{
            ServiceTier: "flex",
        },
    )
    for {
        response, err := iter.Next()
        if err == iterator.Done {
            break
        }
        if err != nil {
            fmt.Printf("An error occurred during streaming: %v\n", err)
            break
        }
        fmt.Print(response.Candidates[0].Content.Parts[0])
    }
}

REST

เมื่อทำการเรียก REST คุณสามารถควบคุมระยะหมดเวลาได้โดยใช้ส่วนหัว HTTP และตัวเลือก curl ร่วมกัน ดังนี้

ส่วนหัว X-Server-Timeout (ระยะหมดเวลาฝั่งเซิร์ฟเวอร์): ส่วนหัวนี้แนะนำระยะหมดเวลาที่ต้องการ (ค่าเริ่มต้น 600 วินาที) ให้กับเซิร์ฟเวอร์ Gemini API เซิร์ฟเวอร์จะพยายามปฏิบัติตามคำแนะนำนี้ แต่ไม่รับประกัน ค่าควรเป็นวินาที
--max-time ใน curl (ระยะหมดเวลาฝั่งไคลเอ็นต์): ตัวเลือก curl --max-time <seconds> จะกำหนดขีดจำกัดสูงสุดของเวลาทั้งหมด (เป็นวินาที) ที่ curl จะรอให้การดำเนินการทั้งหมดเสร็จสมบูรณ์ นี่เป็นมาตรการป้องกันฝั่งไคลเอ็นต์

 # Set a server timeout hint of 120 seconds and a client-side curl timeout of 125 seconds.
 # The API key is sent in the x-goog-api-key header instead of the URL query
 # string, because URLs are commonly recorded in proxy and server access logs.
 curl --max-time 125 \
   -X POST "https://generativelanguage.googleapis.com/v1beta/models/gemini-3-flash-preview:generateContent" \
   -H "x-goog-api-key: $GEMINI_API_KEY" \
   -H "Content-Type: application/json" \
   -H "X-Server-Timeout: 120" \
   -d '{
   "contents": [{
     "parts":[{"text": "Summarize the latest research on quantum computing."}]
   }],
   "service_tier": "flex"
 }'

ระยะหมดเวลาส่วนกลาง

หากต้องการให้การเรียก API ทั้งหมดที่ทำผ่านอินสแตนซ์ genai.Client ที่เฉพาะเจาะจง (ไลบรารีของไคลเอ็นต์เท่านั้น) มีระยะหมดเวลาเริ่มต้น คุณสามารถกำหนดค่านี้เมื่อเริ่มต้นไคลเอ็นต์โดยใช้ http_options และ genai.types.HttpOptions

Python

from google import genai
from google.genai import types

# Default timeout, in milliseconds, for every call made through this client.
global_timeout_ms = 120000

client_with_global_timeout = genai.Client(
    http_options=types.HttpOptions(timeout=global_timeout_ms)
)

try:
    # Calling generate_content using global timeout...
    response = client_with_global_timeout.models.generate_content(
        model="gemini-3-flash-preview",
        contents="Summarize the history of AI development since 2000.",
        config={"service_tier": "flex"},
    )
    print(response.text)

    # A per-request timeout will *override* the global timeout for that specific call.
    shorter_timeout = 30000
    response = client_with_global_timeout.models.generate_content(
        model="gemini-3-flash-preview",
        contents="Provide a very brief definition of machine learning.",
        config={
            "service_tier": "flex",
            "http_options": {"timeout": shorter_timeout},  # Overrides the global timeout
        },
    )

    print(response.text)

except TimeoutError:
    # NOTE(review): confirm that the SDK surfaces timeouts as the builtin
    # TimeoutError; if it raises its own exception type, timeouts are
    # reported by the generic handler below instead.
    print(
        "A GenerateContent call timed out. Check if the global or per-request timeout was exceeded."
    )
except Exception as e:
    print(f"An error occurred: {e}")

JavaScript

import {GoogleGenAI} from '@google/genai';

const globalTimeoutMs = 120000;

const clientWithGlobalTimeout = new GoogleGenAI({httpOptions: {timeout: globalTimeoutMs}});

// Issues one request that relies on the client-wide timeout and one that
// overrides it per request, printing each response text. Timeout-like errors
// get a dedicated message; all other errors are printed as-is.
async function main() {
    try {
        // Calling generate_content using global timeout...
        const response1 = await clientWithGlobalTimeout.models.generateContent({
            model: "gemini-3-flash-preview",
            contents: "Summarize the history of AI development since 2000.",
            config: { serviceTier: "flex" },
        });
        // `text` is a property on the response, not a method.
        console.log(response1.text);

        // A per-request timeout will *override* the global timeout for that specific call.
        const shorterTimeout = 30000;
        const response2 = await clientWithGlobalTimeout.models.generateContent({
            model: "gemini-3-flash-preview",
            contents: "Provide a very brief definition of machine learning.",
            config: {
                serviceTier: "flex",
                httpOptions: {timeout: shorterTimeout}
            }  // Overrides the global timeout
        });

        console.log(response2.text);

    } catch (e) {
        if (e.name === 'TimeoutError' || e.message?.includes('timeout')) {
            console.log(
                "A GenerateContent call timed out. Check if the global or per-request timeout was exceeded."
            );
        } else {
            console.log(`An error occurred: ${e}`);
        }
    }
}

await main();

Go

 package main

 import (
     "context"
     "fmt"
     "log"
     "time"

     "google.golang.org/genai"
 )

 // main issues two Flex-tier requests, each bounded by its own context
 // timeout: a default one and a shorter one. A failure of either request is
 // logged and does not stop the program.
 func main() {
     ctx := context.Background()
     client, err := genai.NewClient(ctx, nil)
     if err != nil {
         log.Fatal(err)
     }

     // Requests go through client.Models with the model name passed per call.
     const modelName = "gemini-3-flash-preview"
     flexConfig := &genai.GenerateContentConfig{ServiceTier: "flex"}

     // Go uses context for timeouts, not client options.
     // Set a default timeout for requests.
     globalTimeout := 120 * time.Second
     fmt.Printf("Using default timeout of %v seconds.\n", globalTimeout.Seconds())

     fmt.Println("Calling generate_content (using default timeout)...")
     ctx1, cancel1 := context.WithTimeout(ctx, globalTimeout)
     defer cancel1()
     resp1, err := client.Models.GenerateContent(ctx1, modelName, genai.Text("Summarize the history of AI development since 2000."), flexConfig)
     if err != nil {
         log.Printf("Request 1 failed: %v", err)
     } else {
         fmt.Println("GenerateContent 1 successful.")
         fmt.Println(resp1.Text())
     }

     // A different timeout can be used for other requests.
     shorterTimeout := 30 * time.Second
     fmt.Printf("\nCalling generate_content with a shorter timeout of %v seconds...\n", shorterTimeout.Seconds())
     ctx2, cancel2 := context.WithTimeout(ctx, shorterTimeout)
     defer cancel2()
     resp2, err := client.Models.GenerateContent(ctx2, modelName, genai.Text("Provide a very brief definition of machine learning."), flexConfig)
     if err != nil {
         log.Printf("Request 2 failed: %v", err)
     } else {
         fmt.Println("GenerateContent 2 successful.")
         fmt.Println(resp2.Text())
     }
 }

ใช้การลองซ้ำ

เนื่องจากคำขอ Flex อาจถูกปลดออก (sheddable) และล้มเหลวด้วยข้อผิดพลาด 503 ต่อไปนี้เป็นตัวอย่างของการใช้ตรรกะการลองซ้ำ (ไม่บังคับ) เพื่อดำเนินการต่อกับคำขอที่ล้มเหลว

Python

import time
from google import genai

client = genai.Client()

def call_with_retry(max_retries=3, base_delay=5):
    """Call the Flex tier, retrying capacity errors with exponential backoff.

    Args:
        max_retries: Maximum number of Flex attempts.
        base_delay: Delay in seconds before the first retry; it doubles
            after each further failed attempt.

    Returns:
        The generate_content response from the Flex tier or, once every Flex
        attempt has failed with a retryable error, from a fallback request
        that omits service_tier. None if max_retries is less than 1.

    Raises:
        Exception: A Flex error whose ``code`` is not 429 or 503 is re-raised
            unchanged, as is any error raised by the fallback request.
    """
    # 503 Service Unavailable and 429 Too Many Requests are the codes
    # returned when Flex capacity is unavailable.
    retryable_codes = (429, 503)

    for attempt in range(max_retries):
        try:
            return client.models.generate_content(
                model="gemini-3-flash-preview",
                contents="Analyze this batch statement.",
                config={"service_tier": "flex"},
            )
        except Exception as e:
            # Not every exception carries a `code` attribute, so read it
            # defensively instead of assuming it exists.
            code = getattr(e, "code", None)
            if code not in retryable_codes:
                # Retrying or falling back cannot fix a non-capacity error.
                raise
            if attempt < max_retries - 1:
                delay = base_delay * (2 ** attempt) # Exponential Backoff
                print(f"Flex busy, retrying in {delay}s...")
                time.sleep(delay)
            else:
                # Fallback to standard on last strike (Optional)
                print("Flex exhausted, falling back to Standard...")
                return client.models.generate_content(
                    model="gemini-3-flash-preview",
                    contents="Analyze this batch statement."
                )

# Usage
response = call_with_retry()
print(response.text)

JavaScript

 import {GoogleGenAI} from '@google/genai';

 const ai = new GoogleGenAI({});

 /**
  * Returns a promise that settles after the given delay.
  * @param {number} ms Delay in milliseconds, passed to setTimeout.
  * @returns {Promise<void>}
  */
 async function sleep(ms) {
   await new Promise((done) => {
     setTimeout(done, ms);
   });
 }

 /**
  * Calls the Flex tier, retrying capacity errors with exponential backoff and
  * falling back to a request without serviceTier after the last attempt.
  *
  * @param {number} maxRetries Maximum number of Flex attempts.
  * @param {number} baseDelay Delay in seconds before the first retry; it
  *   doubles after each further failed attempt.
  * @returns The generateContent response from Flex or from the fallback
  *   request; undefined if maxRetries is less than 1.
  * @throws Re-throws any Flex error whose `status` is not 429 or 503, and any
  *   error thrown by the fallback request.
  */
 async function callWithRetry(maxRetries = 3, baseDelay = 5) {
   // 503 Service Unavailable and 429 Too Many Requests are the codes
   // returned when Flex capacity is unavailable.
   const retryableStatuses = [429, 503];

   for (let attempt = 0; attempt < maxRetries; attempt++) {
     try {
       console.log(`Attempt ${attempt + 1}: Calling Flex tier...`);
       const response = await ai.models.generateContent({
         model: "gemini-3-flash-preview",
         contents: "Analyze this batch statement.",
         config: { serviceTier: 'flex' },
       });
       return response;
     } catch (e) {
       // Retrying or falling back cannot fix a non-capacity error.
       if (!retryableStatuses.includes(e?.status)) {
         throw e;
       }
       if (attempt < maxRetries - 1) {
         const delay = baseDelay * (2 ** attempt);
         console.log(`Flex busy, retrying in ${delay}s...`);
         await sleep(delay * 1000);
       } else {
         console.log("Flex exhausted, falling back to Standard...");
         return await ai.models.generateContent({
           model: "gemini-3-flash-preview",
           contents: "Analyze this batch statement.",
         });
       }
     }
   }
 }

 // Runs the retrying Flex call and prints the text of the response it returns.
 async function main() {
     const flexOrFallback = await callWithRetry();
     const output = flexOrFallback.text;
     console.log(output);
 }

 await main();

Go

 package main

 import (
     "context"
     "fmt"
     "log"
     "math"
     "time"

     "google.golang.org/genai"
 )

 // callWithRetry calls the Flex tier up to maxRetries times, waiting
 // baseDelay, 2*baseDelay, 4*baseDelay, ... between attempts. When the last
 // Flex attempt fails it issues one fallback request with a nil config, which
 // omits ServiceTier, and returns that request's result.
 //
 // It returns ctx.Err() if ctx is done while waiting between attempts, and a
 // "retries exhausted" error if maxRetries is less than 1.
 func callWithRetry(ctx context.Context, client *genai.Client, maxRetries int, baseDelay time.Duration) (*genai.GenerateContentResponse, error) {
     modelName := "gemini-3-flash-preview"
     content := genai.Text("Analyze this batch statement.")
     flexConfig := &genai.GenerateContentConfig{
         ServiceTier: "flex",
     }

     for attempt := 0; attempt < maxRetries; attempt++ {
         log.Printf("Attempt %d: Calling Flex tier...", attempt+1)
         resp, err := client.Models.GenerateContent(ctx, modelName, content, flexConfig)
         if err == nil {
             return resp, nil
         }

         log.Printf("Attempt %d failed: %v", attempt+1, err)

         if attempt == maxRetries-1 {
             log.Println("Flex exhausted, falling back to Standard...")
             // GenerateContent takes four arguments; nil means "no config".
             return client.Models.GenerateContent(ctx, modelName, content, nil)
         }

         delay := time.Duration(float64(baseDelay) * math.Pow(2, float64(attempt)))
         log.Printf("Flex busy, retrying in %v...", delay)
         // Wait for the backoff delay, but stop early if the caller cancels.
         select {
         case <-time.After(delay):
         case <-ctx.Done():
             return nil, ctx.Err()
         }
     }
     // Reached only when maxRetries < 1, so no attempt was made.
     return nil, fmt.Errorf("retries exhausted")
 }

 // main creates a client, runs callWithRetry with 3 attempts and a 5-second
 // base delay, and prints the response text. Any error is fatal.
 func main() {
     ctx := context.Background()
     client, err := genai.NewClient(ctx, nil)
     if err != nil {
         log.Fatal(err)
     }
     // No Close call: the google.golang.org/genai client is used without one,
     // as in the basic Flex example earlier on this page.

     resp, err := callWithRetry(ctx, client, 3, 5*time.Second)
     if err != nil {
         log.Fatalf("Failed after retries: %v", err)
     }
     fmt.Println(resp.Text())
 }

การกำหนดราคา

การอนุมาน Flex มีราคาอยู่ที่ 50% ของ API มาตรฐาน และเรียกเก็บเงินต่อโทเค็น

โมเดลที่รองรับ

โมเดลต่อไปนี้รองรับการอนุมาน Flex

โมเดล	การอนุมาน Flex
Gemini 3.1 Flash-Lite	✔️
Gemini 3.1 Flash-Lite (เวอร์ชันตัวอย่าง)	✔️
Gemini 3.1 Pro (เวอร์ชันตัวอย่าง)	✔️
Gemini 3 Flash (เวอร์ชันตัวอย่าง)	✔️
Gemini 3 Pro Image (เวอร์ชันตัวอย่าง)	✔️
Gemini 2.5 Pro	✔️
Gemini 2.5 Flash	✔️
Gemini 2.5 Flash Image	✔️
Gemini 2.5 Flash-Lite	✔️

ขั้นตอนถัดไป

อ่านเกี่ยวกับตัวเลือกการอนุมานและการเพิ่มประสิทธิภาพอื่นๆ ของ Gemini:

การอนุมานที่มีลำดับความสำคัญสำหรับเวลาในการตอบสนองต่ำมาก
Batch API สำหรับการประมวลผลแบบอะซิงโครนัสภายใน 24 ชั่วโมง
การแคชบริบท เพื่อลดต้นทุนโทเค็นอินพุต