এই পৃষ্ঠাটি Cloud Translation API অনুবাদ করেছে।

অডিও বোঝার

জেমিনি অডিও ইনপুট বিশ্লেষণ করতে পারে এবং টেক্সট প্রতিক্রিয়া তৈরি করতে পারে।

পাইথন

from google import genai

client = genai.Client()

myfile = client.files.upload(file="path/to/sample.mp3")

response = client.models.generate_content(
    model="gemini-3-flash-preview", contents=["Describe this audio clip", myfile]
)

print(response.text)

জাভাস্ক্রিপ্ট

import {
  GoogleGenAI,
  createUserContent,
  createPartFromUri,
} from "@google/genai";

const ai = new GoogleGenAI({});

async function main() {
  const myfile = await ai.files.upload({
    file: "path/to/sample.mp3",
    config: { mimeType: "audio/mp3" },
  });

  const response = await ai.models.generateContent({
    model: "gemini-3-flash-preview",
    contents: createUserContent([
      createPartFromUri(myfile.uri, myfile.mimeType),
      "Describe this audio clip",
    ]),
  });
  console.log(response.text);
}

await main();

যাও

package main

import (
    "context"
    "fmt"
    "os"
    "google.golang.org/genai"
)

func main() {
    ctx := context.Background()
    client, err := genai.NewClient(ctx, nil)
    if err != nil {
        log.Fatal(err)
    }

    localAudioPath := "/path/to/sample.mp3"
    uploadedFile, _ := client.Files.UploadFromPath(
        ctx,
        localAudioPath,
        nil,
    )

    parts := []*genai.Part{
        genai.NewPartFromText("Describe this audio clip"),
        genai.NewPartFromURI(uploadedFile.URI, uploadedFile.MIMEType),
    }
    contents := []*genai.Content{
        genai.NewContentFromParts(parts, genai.RoleUser),
    }

    result, _ := client.Models.GenerateContent(
        ctx,
        "gemini-3-flash-preview",
        contents,
        nil,
    )

    fmt.Println(result.Text())
}

বিশ্রাম

AUDIO_PATH="path/to/sample.mp3"
MIME_TYPE=$(file -b --mime-type "${AUDIO_PATH}")
NUM_BYTES=$(wc -c < "${AUDIO_PATH}")
DISPLAY_NAME=AUDIO

tmp_header_file=upload-header.tmp

# Initial resumable request defining metadata.
# The upload url is in the response headers dump them to a file.
curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -D upload-header.tmp \
  -H "X-Goog-Upload-Protocol: resumable" \
  -H "X-Goog-Upload-Command: start" \
  -H "X-Goog-Upload-Header-Content-Length: ${NUM_BYTES}" \
  -H "X-Goog-Upload-Header-Content-Type: ${MIME_TYPE}" \
  -H "Content-Type: application/json" \
  -d "{'file': {'display_name': '${DISPLAY_NAME}'}}" 2> /dev/null

upload_url=$(grep -i "x-goog-upload-url: " "${tmp_header_file}" | cut -d" " -f2 | tr -d "\r")
rm "${tmp_header_file}"

# Upload the actual bytes.
curl "${upload_url}" \
  -H "Content-Length: ${NUM_BYTES}" \
  -H "X-Goog-Upload-Offset: 0" \
  -H "X-Goog-Upload-Command: upload, finalize" \
  --data-binary "@${AUDIO_PATH}" 2> /dev/null > file_info.json

file_uri=$(jq ".file.uri" file_info.json)
echo file_uri=$file_uri

# Now generate content using that file
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-3-flash-preview:generateContent" \
    -H "x-goog-api-key: $GEMINI_API_KEY" \
    -H 'Content-Type: application/json' \
    -X POST \
    -d '{
      "contents": [{
        "parts":[
          {"text": "Describe this audio clip"},
          {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": '$file_uri'}}]
        }]
      }' 2> /dev/null > response.json

cat response.json
echo

jq ".candidates[].content.parts[].text" response.json

সংক্ষিপ্ত বিবরণ

জেমিনি অডিও ইনপুট বিশ্লেষণ এবং বুঝতে পারে এবং এতে টেক্সট প্রতিক্রিয়া তৈরি করতে পারে, যার ফলে নিম্নলিখিত ব্যবহারের ক্ষেত্রে সক্ষমতা রয়েছে:

অডিও কন্টেন্ট সম্পর্কে বর্ণনা, সারসংক্ষেপ বা প্রশ্নের উত্তর দিন।
অডিওর (স্পিচ টু টেক্সট) একটি ট্রান্সক্রিপশন এবং অনুবাদ প্রদান করুন।
বিভিন্ন স্পিকার সনাক্ত করুন এবং লেবেল করুন (স্পিকার ডায়ারাইজেশন)।
কথা এবং সঙ্গীতে আবেগ সনাক্ত করুন।
অডিওর নির্দিষ্ট অংশ বিশ্লেষণ করুন এবং টাইমস্ট্যাম্প প্রদান করুন।

বর্তমানে জেমিনি এপিআই রিয়েল-টাইম ট্রান্সক্রিপশন ব্যবহারের ক্ষেত্রে সমর্থন করে না। রিয়েল-টাইম ভয়েস এবং ভিডিও ইন্টারঅ্যাকশনের জন্য লাইভ এপিআই দেখুন। রিয়েল-টাইম ট্রান্সক্রিপশনের জন্য ডেডিকেটেড স্পিচ টু টেক্সট মডেলের জন্য, গুগল ক্লাউড স্পিচ-টু-টেক্সট এপিআই ব্যবহার করুন।

বক্তৃতা থেকে টেক্সটে ট্রান্সক্রাইব করুন

এই উদাহরণ অ্যাপ্লিকেশনটি দেখায় কিভাবে জেমিনি API-কে বক্তৃতা প্রতিলিপি, অনুবাদ এবং সংক্ষিপ্তকরণের জন্য অনুরোধ করা যায়, যার মধ্যে রয়েছে টাইমস্ট্যাম্প, স্পিকার ডায়ারাইজেশন এবং স্ট্রাকচার্ড আউটপুট ব্যবহার করে আবেগ সনাক্তকরণ।

পাইথন

from google import genai
from google.genai import types

client = genai.Client()

YOUTUBE_URL = "https://www.youtube.com/watch?v=ku-N-eS1lgM"

def main():
  prompt = """
    Process the audio file and generate a detailed transcription.

    Requirements:
    1. Identify distinct speakers (e.g., Speaker 1, Speaker 2, or names if context allows).
    2. Provide accurate timestamps for each segment (Format: MM:SS).
    3. Detect the primary language of each segment.
    4. If the segment is in a language different than English, also provide the English translation.
    5. Identify the primary emotion of the speaker in this segment. You MUST choose exactly one of the following: Happy, Sad, Angry, Neutral.
    6. Provide a brief summary of the entire audio at the beginning.
  """

  response = client.models.generate_content(
    model="gemini-3-flash-preview",
    contents=[
      types.Content(
        parts=[
          types.Part(
            file_data=types.FileData(
              file_uri=YOUTUBE_URL
            )
          ),
          types.Part(
            text=prompt
          )
        ]
      )
    ],
    config=types.GenerateContentConfig(
      response_mime_type="application/json",
      response_schema=types.Schema(
        type=types.Type.OBJECT,
        properties={
          "summary": types.Schema(
            type=types.Type.STRING,
            description="A concise summary of the audio content.",
          ),
          "segments": types.Schema(
            type=types.Type.ARRAY,
            description="List of transcribed segments with speaker and timestamp.",
            items=types.Schema(
              type=types.Type.OBJECT,
              properties={
                "speaker": types.Schema(type=types.Type.STRING),
                "timestamp": types.Schema(type=types.Type.STRING),
                "content": types.Schema(type=types.Type.STRING),
                "language": types.Schema(type=types.Type.STRING),
                "language_code": types.Schema(type=types.Type.STRING),
                "translation": types.Schema(type=types.Type.STRING),
                "emotion": types.Schema(
                  type=types.Type.STRING,
                  enum=["happy", "sad", "angry", "neutral"]
                ),
              },
              required=["speaker", "timestamp", "content", "language", "language_code", "emotion"],
            ),
          ),
        },
        required=["summary", "segments"],
      ),
    ),
  )

  print(response.text)

if __name__ == "__main__":
  main()

জাভাস্ক্রিপ্ট

import {
  GoogleGenAI,
  Type
} from "@google/genai";

const ai = new GoogleGenAI({});

const YOUTUBE_URL = "https://www.youtube.com/watch?v=ku-N-eS1lgM";

async function main() {
  const prompt = `
      Process the audio file and generate a detailed transcription.

      Requirements:
      1. Identify distinct speakers (e.g., Speaker 1, Speaker 2, or names if context allows).
      2. Provide accurate timestamps for each segment (Format: MM:SS).
      3. Detect the primary language of each segment.
      4. If the segment is in a language different than English, also provide the English translation.
      5. Identify the primary emotion of the speaker in this segment. You MUST choose exactly one of the following: Happy, Sad, Angry, Neutral.
      6. Provide a brief summary of the entire audio at the beginning.
    `;

  const Emotion = {
    Happy: 'happy',
    Sad: 'sad',
    Angry: 'angry',
    Neutral: 'neutral'
  };

  const response = await ai.models.generateContent({
    model: "gemini-3-flash-preview",
    contents: {
      parts: [
        {
          fileData: {
            fileUri: YOUTUBE_URL,
          },
        },
        {
          text: prompt,
        },
      ],
    },
    config: {
      responseMimeType: "application/json",
      responseSchema: {
        type: Type.OBJECT,
        properties: {
          summary: {
            type: Type.STRING,
            description: "A concise summary of the audio content.",
          },
          segments: {
            type: Type.ARRAY,
            description: "List of transcribed segments with speaker and timestamp.",
            items: {
              type: Type.OBJECT,
              properties: {
                speaker: { type: Type.STRING },
                timestamp: { type: Type.STRING },
                content: { type: Type.STRING },
                language: { type: Type.STRING },
                language_code: { type: Type.STRING },
                translation: { type: Type.STRING },
                emotion: {
                  type: Type.STRING,
                  enum: Object.values(Emotion)
                },
              },
              required: ["speaker", "timestamp", "content", "language", "language_code", "emotion"],
            },
          },
        },
        required: ["summary", "segments"],
      },
    },
  });
  const json = JSON.parse(response.text);
  console.log(json);
}

await main();

বিশ্রাম

curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-3-flash-preview:generateContent" \
    -H "x-goog-api-key: $GEMINI_API_KEY" \
    -H 'Content-Type: application/json' \
    -X POST \
    -d '{
      "contents": [
        {
          "parts": [
            {
              "file_data": {
                "file_uri": "https://www.youtube.com/watch?v=ku-N-eS1lgM",
                "mime_type": "video/mp4"
              }
            },
            {
              "text": "Process the audio file and generate a detailed transcription.\n\nRequirements:\n1. Identify distinct speakers (e.g., Speaker 1, Speaker 2, or names if context allows).\n2. Provide accurate timestamps for each segment (Format: MM:SS).\n3. Detect the primary language of each segment.\n4. If the segment is in a language different than English, also provide the English translation.\n5. Identify the primary emotion of the speaker in this segment. You MUST choose exactly one of the following: Happy, Sad, Angry, Neutral.\n6. Provide a brief summary of the entire audio at the beginning."
            }
          ]
        }
      ],
      "generation_config": {
        "response_mime_type": "application/json",
        "response_schema": {
          "type": "OBJECT",
          "properties": {
            "summary": {
              "type": "STRING",
              "description": "A concise summary of the audio content."
            },
            "segments": {
              "type": "ARRAY",
              "description": "List of transcribed segments with speaker and timestamp.",
              "items": {
                "type": "OBJECT",
                "properties": {
                  "speaker": { "type": "STRING" },
                  "timestamp": { "type": "STRING" },
                  "content": { "type": "STRING" },
                  "language": { "type": "STRING" },
                  "language_code": { "type": "STRING" },
                  "translation": { "type": "STRING" },
                  "emotion": {
                    "type": "STRING",
                    "enum": ["happy", "sad", "angry", "neutral"]
                  }
                },
                "required": ["speaker", "timestamp", "content", "language", "language_code", "emotion"]
              }
            }
          },
          "required": ["summary", "segments"]
        }
      }
    }' 2> /dev/null > response.json

cat response.json
echo

jq ".candidates[].content.parts[].text" response.json

আপনি একটি বোতামে ক্লিক করেই AI Studio Build-কে এই উদাহরণ ট্রান্সক্রিপশন অ্যাপের মতো একটি অ্যাপ তৈরি করতে অনুরোধ করতে পারেন।

একটি বহুভাষিক অডিও ট্রান্সক্রিপশন জেমিনি অ্যাপ

অডিও ইনপুট করুন

আপনি নিম্নলিখিত উপায়ে জেমিনিকে অডিও ডেটা সরবরাহ করতে পারেন:

generateContent অনুরোধ করার আগে একটি অডিও ফাইল আপলোড করুন ।
generateContent অনুরোধের সাথে ইনলাইন অডিও ডেটা পাস করুন ।

অন্যান্য ফাইল ইনপুট পদ্ধতি সম্পর্কে জানতে, ফাইল ইনপুট পদ্ধতি নির্দেশিকাটি দেখুন।

একটি অডিও ফাইল আপলোড করুন

আপনি একটি অডিও ফাইল আপলোড করার জন্য Files API ব্যবহার করতে পারেন। যখন মোট অনুরোধের আকার (ফাইল, টেক্সট প্রম্পট, সিস্টেম নির্দেশাবলী ইত্যাদি সহ) 20 MB এর চেয়ে বড় হয় তখন সর্বদা Files API ব্যবহার করুন।

নিম্নলিখিত কোডটি একটি অডিও ফাইল আপলোড করে এবং তারপর generateContent কলে ফাইলটি ব্যবহার করে।

পাইথন

from google import genai

client = genai.Client()

myfile = client.files.upload(file="path/to/sample.mp3")

response = client.models.generate_content(
    model="gemini-3-flash-preview", contents=["Describe this audio clip", myfile]
)

print(response.text)

জাভাস্ক্রিপ্ট

import {
  GoogleGenAI,
  createUserContent,
  createPartFromUri,
} from "@google/genai";

const ai = new GoogleGenAI({});

async function main() {
  const myfile = await ai.files.upload({
    file: "path/to/sample.mp3",
    config: { mimeType: "audio/mp3" },
  });

  const response = await ai.models.generateContent({
    model: "gemini-3-flash-preview",
    contents: createUserContent([
      createPartFromUri(myfile.uri, myfile.mimeType),
      "Describe this audio clip",
    ]),
  });
  console.log(response.text);
}

await main();

যাও

package main

import (
  "context"
  "fmt"
  "os"
  "google.golang.org/genai"
)

func main() {
  ctx := context.Background()
  client, err := genai.NewClient(ctx, nil)
  if err != nil {
      log.Fatal(err)
  }

  localAudioPath := "/path/to/sample.mp3"
  uploadedFile, _ := client.Files.UploadFromPath(
      ctx,
      localAudioPath,
      nil,
  )

  parts := []*genai.Part{
      genai.NewPartFromText("Describe this audio clip"),
      genai.NewPartFromURI(uploadedFile.URI, uploadedFile.MIMEType),
  }
  contents := []*genai.Content{
      genai.NewContentFromParts(parts, genai.RoleUser),
  }

  result, _ := client.Models.GenerateContent(
      ctx,
      "gemini-3-flash-preview",
      contents,
      nil,
  )

  fmt.Println(result.Text())
}

বিশ্রাম

AUDIO_PATH="path/to/sample.mp3"
MIME_TYPE=$(file -b --mime-type "${AUDIO_PATH}")
NUM_BYTES=$(wc -c < "${AUDIO_PATH}")
DISPLAY_NAME=AUDIO

tmp_header_file=upload-header.tmp

# Initial resumable request defining metadata.
# The upload url is in the response headers dump them to a file.
curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -D upload-header.tmp \
  -H "X-Goog-Upload-Protocol: resumable" \
  -H "X-Goog-Upload-Command: start" \
  -H "X-Goog-Upload-Header-Content-Length: ${NUM_BYTES}" \
  -H "X-Goog-Upload-Header-Content-Type: ${MIME_TYPE}" \
  -H "Content-Type: application/json" \
  -d "{'file': {'display_name': '${DISPLAY_NAME}'}}" 2> /dev/null

upload_url=$(grep -i "x-goog-upload-url: " "${tmp_header_file}" | cut -d" " -f2 | tr -d "\r")
rm "${tmp_header_file}"

# Upload the actual bytes.
curl "${upload_url}" \
  -H "Content-Length: ${NUM_BYTES}" \
  -H "X-Goog-Upload-Offset: 0" \
  -H "X-Goog-Upload-Command: upload, finalize" \
  --data-binary "@${AUDIO_PATH}" 2> /dev/null > file_info.json

file_uri=$(jq ".file.uri" file_info.json)
echo file_uri=$file_uri

# Now generate content using that file
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-3-flash-preview:generateContent" \
    -H "x-goog-api-key: $GEMINI_API_KEY" \
    -H 'Content-Type: application/json' \
    -X POST \
    -d '{
      "contents": [{
        "parts":[
          {"text": "Describe this audio clip"},
          {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": '$file_uri'}}]
        }]
      }' 2> /dev/null > response.json

cat response.json
echo

jq ".candidates[].content.parts[].text" response.json

মিডিয়া ফাইল নিয়ে কাজ করার বিষয়ে আরও জানতে, Files API দেখুন।

অডিও ডেটা ইনলাইনে পাস করুন

একটি অডিও ফাইল আপলোড করার পরিবর্তে, আপনি generateContent এর অনুরোধে ইনলাইন অডিও ডেটা পাস করতে পারেন:

পাইথন

from google import genai
from google.genai import types

with open('path/to/small-sample.mp3', 'rb') as f:
    audio_bytes = f.read()

client = genai.Client()
response = client.models.generate_content(
  model='gemini-3-flash-preview',
  contents=[
    'Describe this audio clip',
    types.Part.from_bytes(
      data=audio_bytes,
      mime_type='audio/mp3',
    )
  ]
)

print(response.text)

জাভাস্ক্রিপ্ট

import { GoogleGenAI } from "@google/genai";
import * as fs from "node:fs";

const ai = new GoogleGenAI({});
const base64AudioFile = fs.readFileSync("path/to/small-sample.mp3", {
  encoding: "base64",
});

const contents = [
  { text: "Please summarize the audio." },
  {
    inlineData: {
      mimeType: "audio/mp3",
      data: base64AudioFile,
    },
  },
];

const response = await ai.models.generateContent({
  model: "gemini-3-flash-preview",
  contents: contents,
});
console.log(response.text);

যাও

package main

import (
  "context"
  "fmt"
  "os"
  "google.golang.org/genai"
)

func main() {
  ctx := context.Background()
  client, err := genai.NewClient(ctx, nil)
  if err != nil {
      log.Fatal(err)
  }

  audioBytes, _ := os.ReadFile("/path/to/small-sample.mp3")

  parts := []*genai.Part{
      genai.NewPartFromText("Describe this audio clip"),
    &genai.Part{
      InlineData: &genai.Blob{
        MIMEType: "audio/mp3",
        Data:     audioBytes,
      },
    },
  }
  contents := []*genai.Content{
      genai.NewContentFromParts(parts, genai.RoleUser),
  }

  result, _ := client.Models.GenerateContent(
      ctx,
      "gemini-3-flash-preview",
      contents,
      nil,
  )

  fmt.Println(result.Text())
}

ইনলাইন অডিও ডেটা সম্পর্কে কিছু বিষয় মনে রাখতে হবে:

সর্বাধিক অনুরোধের আকার হল ২০ মেগাবাইট, যার মধ্যে টেক্সট প্রম্পট, সিস্টেম নির্দেশাবলী এবং ইনলাইনে প্রদত্ত ফাইল অন্তর্ভুক্ত। যদি আপনার ফাইলের আকার মোট অনুরোধের আকার ২০ মেগাবাইট ছাড়িয়ে যায়, তাহলে অনুরোধে ব্যবহারের জন্য একটি অডিও ফাইল আপলোড করতে Files API ব্যবহার করুন।
যদি আপনি একাধিকবার একটি অডিও নমুনা ব্যবহার করেন, তাহলে একটি অডিও ফাইল আপলোড করা আরও কার্যকর।

একটি প্রতিলিপি পান

অডিও ডেটার একটি ট্রান্সক্রিপ্ট পেতে, প্রম্পটে এটি জিজ্ঞাসা করুন:

পাইথন

from google import genai

client = genai.Client()
myfile = client.files.upload(file='path/to/sample.mp3')
prompt = 'Generate a transcript of the speech.'

response = client.models.generate_content(
  model='gemini-3-flash-preview',
  contents=[prompt, myfile]
)

print(response.text)

জাভাস্ক্রিপ্ট

import {
  GoogleGenAI,
  createUserContent,
  createPartFromUri,
} from "@google/genai";

const ai = new GoogleGenAI({});
const myfile = await ai.files.upload({
  file: "path/to/sample.mp3",
  config: { mimeType: "audio/mpeg" },
});

const result = await ai.models.generateContent({
  model: "gemini-3-flash-preview",
  contents: createUserContent([
    createPartFromUri(myfile.uri, myfile.mimeType),
    "Generate a transcript of the speech.",
  ]),
});
console.log("result.text=", result.text);

যাও

package main

import (
  "context"
  "fmt"
  "os"
  "google.golang.org/genai"
)

func main() {
  ctx := context.Background()
  client, err := genai.NewClient(ctx, nil)
  if err != nil {
      log.Fatal(err)
  }

  localAudioPath := "/path/to/sample.mp3"
  uploadedFile, _ := client.Files.UploadFromPath(
      ctx,
      localAudioPath,
      nil,
  )

  parts := []*genai.Part{
      genai.NewPartFromText("Generate a transcript of the speech."),
      genai.NewPartFromURI(uploadedFile.URI, uploadedFile.MIMEType),
  }
  contents := []*genai.Content{
      genai.NewContentFromParts(parts, genai.RoleUser),
  }

  result, _ := client.Models.GenerateContent(
      ctx,
      "gemini-3-flash-preview",
      contents,
      nil,
  )

  fmt.Println(result.Text())
}

টাইমস্ট্যাম্পগুলি দেখুন

আপনি MM:SS ফর্মের টাইমস্ট্যাম্প ব্যবহার করে একটি অডিও ফাইলের নির্দিষ্ট অংশগুলি উল্লেখ করতে পারেন। উদাহরণস্বরূপ, নিম্নলিখিত প্রম্পটটি একটি ট্রান্সক্রিপ্টের অনুরোধ করে যা

ফাইলের শুরু থেকে ২ মিনিট ৩০ সেকেন্ডে শুরু হয়।
ফাইলের শুরু থেকে ৩ মিনিট ২৯ সেকেন্ডে শেষ হয়।

পাইথন

# Create a prompt containing timestamps.
prompt = "Provide a transcript of the speech from 02:30 to 03:29."

জাভাস্ক্রিপ্ট

// Create a prompt containing timestamps.
const prompt = "Provide a transcript of the speech from 02:30 to 03:29."

যাও

package main

import (
  "context"
  "fmt"
  "os"
  "google.golang.org/genai"
)

func main() {
  ctx := context.Background()
  client, err := genai.NewClient(ctx, nil)
  if err != nil {
      log.Fatal(err)
  }

  localAudioPath := "/path/to/sample.mp3"
  uploadedFile, _ := client.Files.UploadFromPath(
      ctx,
      localAudioPath,
      nil,
  )

  parts := []*genai.Part{
      genai.NewPartFromText("Provide a transcript of the speech " +
                            "between the timestamps 02:30 and 03:29."),
      genai.NewPartFromURI(uploadedFile.URI, uploadedFile.MIMEType),
  }
  contents := []*genai.Content{
      genai.NewContentFromParts(parts, genai.RoleUser),
  }

  result, _ := client.Models.GenerateContent(
      ctx,
      "gemini-3-flash-preview",
      contents,
      nil,
  )

  fmt.Println(result.Text())
}

টোকেন গণনা করুন

একটি অডিও ফাইলে টোকেনের সংখ্যা গণনা করতে countTokens পদ্ধতিটি কল করুন। উদাহরণস্বরূপ:

পাইথন

from google import genai

client = genai.Client()
response = client.models.count_tokens(
  model='gemini-3-flash-preview',
  contents=[myfile]
)

print(response)

জাভাস্ক্রিপ্ট

import {
  GoogleGenAI,
  createUserContent,
  createPartFromUri,
} from "@google/genai";

const ai = new GoogleGenAI({});
const myfile = await ai.files.upload({
  file: "path/to/sample.mp3",
  config: { mimeType: "audio/mpeg" },
});

const countTokensResponse = await ai.models.countTokens({
  model: "gemini-3-flash-preview",
  contents: createUserContent([
    createPartFromUri(myfile.uri, myfile.mimeType),
  ]),
});
console.log(countTokensResponse.totalTokens);

যাও

package main

import (
  "context"
  "fmt"
  "os"
  "google.golang.org/genai"
)

func main() {
  ctx := context.Background()
  client, err := genai.NewClient(ctx, nil)
  if err != nil {
      log.Fatal(err)
  }

  localAudioPath := "/path/to/sample.mp3"
  uploadedFile, _ := client.Files.UploadFromPath(
      ctx,
      localAudioPath,
      nil,
  )

  parts := []*genai.Part{
      genai.NewPartFromURI(uploadedFile.URI, uploadedFile.MIMEType),
  }
  contents := []*genai.Content{
      genai.NewContentFromParts(parts, genai.RoleUser),
  }

  tokens, _ := client.Models.CountTokens(
      ctx,
      "gemini-3-flash-preview",
      contents,
      nil,
  )

  fmt.Printf("File %s is %d tokens\n", localAudioPath, tokens.TotalTokens)
}

সমর্থিত অডিও ফর্ম্যাটগুলি

জেমিনি নিম্নলিখিত অডিও ফর্ম্যাট MIME প্রকারগুলিকে সমর্থন করে:

WAV - audio/wav
MP3 - audio/mp3
AIFF - audio/aiff
AAC - audio/aac
OGG Vorbis - audio/ogg
FLAC - audio/flac

অডিও সম্পর্কে প্রযুক্তিগত বিবরণ

জেমিনি প্রতিটি সেকেন্ডের অডিওকে ৩২টি টোকেন হিসেবে উপস্থাপন করে; উদাহরণস্বরূপ, এক মিনিটের অডিওকে ১,৯২০টি টোকেন হিসেবে উপস্থাপন করা হয়।
মিথুন রাশির জাতক জাতিকারা পাখির গান বা সাইরেনের মতো বক্তৃতা-বহির্ভূত উপাদানগুলিকে "বুঝতে" পারে।
একটি একক প্রম্পটে অডিও ডেটার সর্বাধিক সমর্থিত দৈর্ঘ্য হল ৯.৫ ঘন্টা। জেমিনি একটি একক প্রম্পটে অডিও ফাইলের সংখ্যা সীমাবদ্ধ করে না; তবে, একটি একক প্রম্পটে সমস্ত অডিও ফাইলের মোট দৈর্ঘ্য ৯.৫ ঘন্টার বেশি হতে পারে না।
জেমিনি অডিও ফাইলগুলিকে ১৬ Kbps ডেটা রেজোলিউশনে ডাউনসাম্পল করে।
যদি অডিও উৎসে একাধিক চ্যানেল থাকে, তাহলে জেমিনি সেই চ্যানেলগুলিকে একটি একক চ্যানেলে একত্রিত করে।

এরপর কি?

এই নির্দেশিকাটি অডিও ডেটার প্রতিক্রিয়ায় কীভাবে টেক্সট তৈরি করতে হয় তা দেখায়। আরও জানতে, নিম্নলিখিত সংস্থানগুলি দেখুন:

ফাইল প্রম্পটিং কৌশল : জেমিনি এপিআই টেক্সট, ইমেজ, অডিও এবং ভিডিও ডেটা সহ প্রম্পটিং সমর্থন করে, যা মাল্টিমোডাল প্রম্পটিং নামেও পরিচিত।
সিস্টেম নির্দেশাবলী : সিস্টেম নির্দেশাবলী আপনাকে আপনার নির্দিষ্ট চাহিদা এবং ব্যবহারের ক্ষেত্রের উপর ভিত্তি করে মডেলের আচরণ পরিচালনা করতে দেয়।
নিরাপত্তা নির্দেশিকা : কখনও কখনও জেনারেটিভ এআই মডেলগুলি অপ্রত্যাশিত আউটপুট তৈরি করে, যেমন আউটপুটগুলি ভুল, পক্ষপাতদুষ্ট বা আক্রমণাত্মক। এই ধরনের আউটপুট থেকে ক্ষতির ঝুঁকি সীমিত করার জন্য পোস্ট-প্রসেসিং এবং মানব মূল্যায়ন অপরিহার্য।