How to create the 'audio' column from the Mozilla Common Voice Project

cdsda44 · October 17, 2023, 10:31am

I use the following code to create ‘path’, ‘sentence’, ‘audio’ from my collected datasets.

I want to know whether the ‘audio’ column data are correct and in the same format as Mozilla Common Voice, so that I can run the updated Colab notebook from the blog.

import os
import soundfile as sf
import csv
from datasets import Dataset

# Directory containing your audio files. The script lists it for ".wav" files
# and looks for each transcript (same base name, ".txt") in the same place.
# NOTE(review): this looks like a Google Drive mount inside Colab - adjust to your setup.
audio_dir = "/content/drive/MyDrive/1"

# Function to read audio files as bytes
def read_audio_file_as_bytes(audio_path):
    """Return the complete, undecoded contents of the file at ``audio_path``.

    The file is opened in binary mode, so the result is a ``bytes`` object
    holding exactly what is stored on disk (no audio decoding is performed).
    """
    with open(audio_path, mode='rb') as stream:
        return stream.read()

# Full paths of every ".wav" file directly inside audio_dir.
# os.listdir() returns entries in arbitrary order, so the paths are sorted to
# make the order of the resulting rows reproducible from run to run.
audio_files = sorted(
    os.path.join(audio_dir, filename)
    for filename in os.listdir(audio_dir)
    if filename.endswith(".wav")
)

# Build one record (dict) per audio file with the keys
# 'path', 'audio', 'sentence', 'age' and 'male'.
data = []
for audio_file in audio_files:
    # The transcript is expected next to the audio file: same name, ".txt" extension
    base_name = os.path.splitext(audio_file)[0]
    text_file = f"{base_name}.txt"

    try:
        # Decode explicitly as UTF-8 so non-ASCII transcripts do not depend on
        # the platform's default locale encoding.
        with open(text_file, 'r', encoding='utf-8') as file:
            # Text files usually end with a newline; strip surrounding
            # whitespace so it does not become part of the sentence.
            sentence_text = file.read().strip()
    except FileNotFoundError:
        sentence_text = ""  # No transcript for this audio file: keep the row, leave the sentence empty

    data.append({
        'path': audio_file,
        # NOTE(review): this stores the raw, undecoded file bytes - confirm the
        # consumer of the 'audio' column expects bytes rather than decoded audio.
        'audio': read_audio_file_as_bytes(audio_file),
        'sentence': sentence_text,  # Include the sentence text
        'age': '',  # Modify this to include the age information
        'male': '',  # Modify this to include the gender information
    })

# Transpose the row-oriented records in `data` into the column-oriented mapping
# (column name -> list of values) that is handed to Dataset.from_dict.
audio_dataset = Dataset.from_dict(
    {
        column: [row[column] for row in data]
        for column in ("path", "audio", "sentence", "age", "male")
    }
)

# Write every record of audio_dataset to a CSV file, one row per record
csv_filename = "/content/drive/MyDrive/1/output.csv"  # Modify the path as needed
# newline='' lets the csv module control line endings itself; the explicit
# UTF-8 encoding keeps non-ASCII sentences intact regardless of the platform locale.
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ["path", "sentence", "audio", "age", "male"]
    csv_writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Write the header
    csv_writer.writeheader()

    # NOTE(review): the 'audio' values were built from raw file bytes; the csv
    # module stringifies non-string values with str(), so confirm that a bytes
    # repr in this column is really what is wanted.
    csv_writer.writerows(audio_dataset)

print(f"CSV file '{csv_filename}' has been created with 'path', 'sentence', 'audio', 'age', and 'male' columns.")

Topic		Replies	Views
Create the Moxilla Common Voice Data 🤗Datasets	2	574	November 15, 2022
How to create a dataset like common voice? 🤗Datasets	2	462	January 31, 2022
How to create a dataset from a CSV for transcription 🤗Datasets	1	393	January 23, 2023
Please, help me 🤗Datasets	1	595	January 10, 2022
[SOLVED] How to import a custom dataset (wav2vec2 & Common Voice)? Beginners	5	1529	August 4, 2023

How to create the 'audio' column from the Mozilla Common Voice Project

Related Topics