Veo 3.1 พร้อมให้ใช้งานแล้ว อ่านเกี่ยวกับโมเดลใหม่และฟีเจอร์ต่างๆ ได้ใน บล็อกโพสต์และ เอกสารประกอบ

หน้านี้ได้รับการแปลโดย Cloud Translation API

ความเข้าใจเกี่ยวกับเสียง

Gemini สามารถวิเคราะห์และทำความเข้าใจอินพุตเสียง ซึ่งเปิดโอกาสให้ใช้กรณีการใช้งานต่อไปนี้

อธิบาย สรุป หรือตอบคำถามเกี่ยวกับเนื้อหาเสียง
ระบุข้อความถอดเสียง
วิเคราะห์ส่วนของเสียงที่เฉพาะเจาะจง

คู่มือนี้จะแสดงวิธีใช้ Gemini API เพื่อสร้างคำตอบแบบข้อความสำหรับอินพุตเสียง

ก่อนเริ่มต้น

ก่อนเรียกใช้ Gemini API โปรดตรวจสอบว่าคุณได้ติดตั้ง SDK ที่ต้องการ รวมถึงกําหนดค่าคีย์ Gemini API ให้พร้อมใช้งานแล้ว

อินพุตเสียง

คุณส่งข้อมูลเสียงไปยัง Gemini ได้ดังนี้

อัปโหลดไฟล์เสียงก่อนส่งคำขอไปที่ generateContent
ส่งข้อมูลเสียงในบรรทัดพร้อมกับคำขอไปที่ generateContent

อัปโหลดไฟล์เสียง

คุณสามารถใช้ Files API เพื่ออัปโหลดไฟล์เสียงได้ ใช้ Files API เสมอเมื่อคำขอทั้งหมด (รวมถึงไฟล์ ข้อความ พรอมต์ คำสั่งของระบบ ฯลฯ) มีขนาดมากกว่า 20 MB

โค้ดต่อไปนี้จะอัปโหลดไฟล์เสียง แล้วใช้ไฟล์ในการเรียกใช้ generateContent

Python

from google import genai

client = genai.Client()

myfile = client.files.upload(file="path/to/sample.mp3")

response = client.models.generate_content(
    model="gemini-2.5-flash", contents=["Describe this audio clip", myfile]
)

print(response.text)

JavaScript

import {
  GoogleGenAI,
  createUserContent,
  createPartFromUri,
} from "@google/genai";

const ai = new GoogleGenAI({});

async function main() {
  const myfile = await ai.files.upload({
    file: "path/to/sample.mp3",
    config: { mimeType: "audio/mp3" },
  });

  const response = await ai.models.generateContent({
    model: "gemini-2.5-flash",
    contents: createUserContent([
      createPartFromUri(myfile.uri, myfile.mimeType),
      "Describe this audio clip",
    ]),
  });
  console.log(response.text);
}

await main();

Go

package main

import (
  "context"
  "fmt"
  "os"
  "google.golang.org/genai"
)

func main() {
  ctx := context.Background()
  client, err := genai.NewClient(ctx, nil)
  if err != nil {
      log.Fatal(err)
  }

  localAudioPath := "/path/to/sample.mp3"
  uploadedFile, _ := client.Files.UploadFromPath(
      ctx,
      localAudioPath,
      nil,
  )

  parts := []*genai.Part{
      genai.NewPartFromText("Describe this audio clip"),
      genai.NewPartFromURI(uploadedFile.URI, uploadedFile.MIMEType),
  }
  contents := []*genai.Content{
      genai.NewContentFromParts(parts, genai.RoleUser),
  }

  result, _ := client.Models.GenerateContent(
      ctx,
      "gemini-2.5-flash",
      contents,
      nil,
  )

  fmt.Println(result.Text())
}

REST

AUDIO_PATH="path/to/sample.mp3"
MIME_TYPE=$(file -b --mime-type "${AUDIO_PATH}")
NUM_BYTES=$(wc -c < "${AUDIO_PATH}")
DISPLAY_NAME=AUDIO

tmp_header_file=upload-header.tmp

# Initial resumable request defining metadata.
# The upload url is in the response headers dump them to a file.
curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -D upload-header.tmp \
  -H "X-Goog-Upload-Protocol: resumable" \
  -H "X-Goog-Upload-Command: start" \
  -H "X-Goog-Upload-Header-Content-Length: ${NUM_BYTES}" \
  -H "X-Goog-Upload-Header-Content-Type: ${MIME_TYPE}" \
  -H "Content-Type: application/json" \
  -d "{'file': {'display_name': '${DISPLAY_NAME}'}}" 2> /dev/null

upload_url=$(grep -i "x-goog-upload-url: " "${tmp_header_file}" | cut -d" " -f2 | tr -d "\r")
rm "${tmp_header_file}"

# Upload the actual bytes.
curl "${upload_url}" \
  -H "Content-Length: ${NUM_BYTES}" \
  -H "X-Goog-Upload-Offset: 0" \
  -H "X-Goog-Upload-Command: upload, finalize" \
  --data-binary "@${AUDIO_PATH}" 2> /dev/null > file_info.json

file_uri=$(jq ".file.uri" file_info.json)
echo file_uri=$file_uri

# Now generate content using that file
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
    -H "x-goog-api-key: $GEMINI_API_KEY" \
    -H 'Content-Type: application/json' \
    -X POST \
    -d '{
      "contents": [{
        "parts":[
          {"text": "Describe this audio clip"},
          {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": '$file_uri'}}]
        }]
      }' 2> /dev/null > response.json

cat response.json
echo

jq ".candidates[].content.parts[].text" response.json

ดูข้อมูลเพิ่มเติมเกี่ยวกับการทำงานกับไฟล์สื่อได้ที่ Files API

ส่งข้อมูลเสียงในบรรทัด

คุณสามารถส่งข้อมูลเสียงในบรรทัดในคำขอไปยัง generateContent แทนการอัปโหลดไฟล์เสียงได้ ดังนี้

Python

from google import genai
from google.genai import types

with open('path/to/small-sample.mp3', 'rb') as f:
    audio_bytes = f.read()

client = genai.Client()
response = client.models.generate_content(
  model='gemini-2.5-flash',
  contents=[
    'Describe this audio clip',
    types.Part.from_bytes(
      data=audio_bytes,
      mime_type='audio/mp3',
    )
  ]
)

print(response.text)

JavaScript

import { GoogleGenAI } from "@google/genai";
import * as fs from "node:fs";

const ai = new GoogleGenAI({});
const base64AudioFile = fs.readFileSync("path/to/small-sample.mp3", {
  encoding: "base64",
});

const contents = [
  { text: "Please summarize the audio." },
  {
    inlineData: {
      mimeType: "audio/mp3",
      data: base64AudioFile,
    },
  },
];

const response = await ai.models.generateContent({
  model: "gemini-2.5-flash",
  contents: contents,
});
console.log(response.text);

Go

package main

import (
  "context"
  "fmt"
  "os"
  "google.golang.org/genai"
)

func main() {
  ctx := context.Background()
  client, err := genai.NewClient(ctx, nil)
  if err != nil {
      log.Fatal(err)
  }

  audioBytes, _ := os.ReadFile("/path/to/small-sample.mp3")

  parts := []*genai.Part{
      genai.NewPartFromText("Describe this audio clip"),
    &genai.Part{
      InlineData: &genai.Blob{
        MIMEType: "audio/mp3",
        Data:     audioBytes,
      },
    },
  }
  contents := []*genai.Content{
      genai.NewContentFromParts(parts, genai.RoleUser),
  }

  result, _ := client.Models.GenerateContent(
      ctx,
      "gemini-2.5-flash",
      contents,
      nil,
  )

  fmt.Println(result.Text())
}

สิ่งที่ควรคำนึงถึงเกี่ยวกับข้อมูลเสียงในบรรทัดมีดังนี้

คำขอมีขนาดสูงสุด 20 MB ซึ่งรวมถึงข้อความแจ้ง วิธีการของระบบ และไฟล์ที่ระบุในบรรทัด หากขนาดไฟล์จะทำให้ขนาดคำขอทั้งหมดเกิน 20 MB ให้ใช้ Files API เพื่ออัปโหลดไฟล์เสียงเพื่อใช้ในคำขอ
หากใช้ตัวอย่างเสียงหลายครั้ง การอัปโหลดไฟล์เสียงจะมีประสิทธิภาพมากกว่า

ดูข้อความถอดเสียง

หากต้องการดูข้อความถอดเสียงของข้อมูลเสียง ให้ถามในพรอมต์ดังนี้

Python

from google import genai

client = genai.Client()
myfile = client.files.upload(file='path/to/sample.mp3')
prompt = 'Generate a transcript of the speech.'

response = client.models.generate_content(
  model='gemini-2.5-flash',
  contents=[prompt, myfile]
)

print(response.text)

JavaScript

import {
  GoogleGenAI,
  createUserContent,
  createPartFromUri,
} from "@google/genai";

const ai = new GoogleGenAI({});
const myfile = await ai.files.upload({
  file: "path/to/sample.mp3",
  config: { mimeType: "audio/mpeg" },
});

const result = await ai.models.generateContent({
  model: "gemini-2.5-flash",
  contents: createUserContent([
    createPartFromUri(myfile.uri, myfile.mimeType),
    "Generate a transcript of the speech.",
  ]),
});
console.log("result.text=", result.text);

Go

package main

import (
  "context"
  "fmt"
  "os"
  "google.golang.org/genai"
)

func main() {
  ctx := context.Background()
  client, err := genai.NewClient(ctx, nil)
  if err != nil {
      log.Fatal(err)
  }

  localAudioPath := "/path/to/sample.mp3"
  uploadedFile, _ := client.Files.UploadFromPath(
      ctx,
      localAudioPath,
      nil,
  )

  parts := []*genai.Part{
      genai.NewPartFromText("Generate a transcript of the speech."),
      genai.NewPartFromURI(uploadedFile.URI, uploadedFile.MIMEType),
  }
  contents := []*genai.Content{
      genai.NewContentFromParts(parts, genai.RoleUser),
  }

  result, _ := client.Models.GenerateContent(
      ctx,
      "gemini-2.5-flash",
      contents,
      nil,
  )

  fmt.Println(result.Text())
}

ดูการประทับเวลา

คุณสามารถอ้างอิงส่วนต่างๆ ของไฟล์เสียงได้โดยใช้การประทับเวลาในรูปแบบ MM:SS ตัวอย่างเช่น พรอมต์ต่อไปนี้จะขอข้อความถอดเสียงที่

เริ่มที่ 2 นาที 30 วินาทีจากต้นไฟล์
สิ้นสุดที่ 3 นาที 29 วินาทีนับจากต้นไฟล์

Python

# Create a prompt containing timestamps.
prompt = "Provide a transcript of the speech from 02:30 to 03:29."

JavaScript

// Create a prompt containing timestamps.
const prompt = "Provide a transcript of the speech from 02:30 to 03:29."

Go

package main

import (
  "context"
  "fmt"
  "os"
  "google.golang.org/genai"
)

func main() {
  ctx := context.Background()
  client, err := genai.NewClient(ctx, nil)
  if err != nil {
      log.Fatal(err)
  }

  localAudioPath := "/path/to/sample.mp3"
  uploadedFile, _ := client.Files.UploadFromPath(
      ctx,
      localAudioPath,
      nil,
  )

  parts := []*genai.Part{
      genai.NewPartFromText("Provide a transcript of the speech " +
                            "between the timestamps 02:30 and 03:29."),
      genai.NewPartFromURI(uploadedFile.URI, uploadedFile.MIMEType),
  }
  contents := []*genai.Content{
      genai.NewContentFromParts(parts, genai.RoleUser),
  }

  result, _ := client.Models.GenerateContent(
      ctx,
      "gemini-2.5-flash",
      contents,
      nil,
  )

  fmt.Println(result.Text())
}

นับโทเค็น

เรียกใช้เมธอด countTokens เพื่อรับจํานวนโทเค็นในไฟล์เสียง เช่น

Python

from google import genai

client = genai.Client()
response = client.models.count_tokens(
  model='gemini-2.5-flash',
  contents=[myfile]
)

print(response)

JavaScript

import {
  GoogleGenAI,
  createUserContent,
  createPartFromUri,
} from "@google/genai";

const ai = new GoogleGenAI({});
const myfile = await ai.files.upload({
  file: "path/to/sample.mp3",
  config: { mimeType: "audio/mpeg" },
});

const countTokensResponse = await ai.models.countTokens({
  model: "gemini-2.5-flash",
  contents: createUserContent([
    createPartFromUri(myfile.uri, myfile.mimeType),
  ]),
});
console.log(countTokensResponse.totalTokens);

Go

package main

import (
  "context"
  "fmt"
  "os"
  "google.golang.org/genai"
)

func main() {
  ctx := context.Background()
  client, err := genai.NewClient(ctx, nil)
  if err != nil {
      log.Fatal(err)
  }

  localAudioPath := "/path/to/sample.mp3"
  uploadedFile, _ := client.Files.UploadFromPath(
      ctx,
      localAudioPath,
      nil,
  )

  parts := []*genai.Part{
      genai.NewPartFromURI(uploadedFile.URI, uploadedFile.MIMEType),
  }
  contents := []*genai.Content{
      genai.NewContentFromParts(parts, genai.RoleUser),
  }

  tokens, _ := client.Models.CountTokens(
      ctx,
      "gemini-2.5-flash",
      contents,
      nil,
  )

  fmt.Printf("File %s is %d tokens\n", localAudioPath, tokens.TotalTokens)
}

รูปแบบเสียงที่รองรับ

Gemini รองรับประเภท MIME ของรูปแบบเสียงต่อไปนี้

WAV - audio/wav
MP3 - audio/mp3
AIFF - audio/aiff
AAC - audio/aac
OGG Vorbis - audio/ogg
FLAC - audio/flac

รายละเอียดทางเทคนิคเกี่ยวกับเสียง

Gemini จะแสดงเสียงแต่ละวินาทีเป็นโทเค็น 32 รายการ เช่น เสียง 1 นาทีจะแสดงเป็นโทเค็น 1,920 รายการ
Gemini สามารถ "เข้าใจ" องค์ประกอบที่ไม่ใช่คำพูด เช่น เสียงนกร้องหรือเสียงไซเรน
ความยาวสูงสุดของข้อมูลเสียงที่รองรับในพรอมต์เดียวคือ 9.5 ชั่วโมง Gemini ไม่จำกัดจำนวนไฟล์เสียงในพรอมต์เดียว แต่ความยาวรวมของไฟล์เสียงทั้งหมดในพรอมต์เดียวต้องไม่เกิน 9.5 ชั่วโมง
Gemini จะลดขนาดไฟล์เสียงเป็นความละเอียดข้อมูล 16 Kbps
หากแหล่งที่มาของเสียงมีหลายช่อง Gemini จะรวมช่องเหล่านั้นเป็นช่องเดียว

ขั้นตอนถัดไป

คู่มือนี้จะแสดงวิธีสร้างข้อความเพื่อตอบสนองต่อข้อมูลเสียง ดูข้อมูลเพิ่มเติมได้ที่แหล่งข้อมูลต่อไปนี้

กลยุทธ์การแจ้งไฟล์: Gemini API รองรับการแจ้งด้วยข้อมูลข้อความ รูปภาพ เสียง และวิดีโอ หรือที่เรียกว่าการแจ้งแบบหลายรูปแบบ
คำสั่งของระบบ: คำสั่งของระบบช่วยให้คุณควบคุมลักษณะการทํางานของโมเดลตามความต้องการและกรณีการใช้งานที่เฉพาะเจาะจง
คำแนะนำด้านความปลอดภัย: บางครั้งโมเดล Generative AI จะสร้างเอาต์พุตที่ไม่คาดคิด เช่น เอาต์พุตที่ไม่ถูกต้อง มีอคติ หรือไม่เหมาะสม ขั้นตอนหลังการประมวลผลและการประเมินจากเจ้าหน้าที่เป็นสิ่งจําเป็นในการจำกัดความเสี่ยงของอันตรายจากเอาต์พุตดังกล่าว