I use the following code to create the ‘path’, ‘sentence’, and ‘audio’ columns from my collected dataset.
I wanted to know whether the ‘audio’ column data are correct and in the same format as Mozilla Common Voice, so that I can run the updated Colab notebook from the blog.
import csv
import os

import soundfile as sf
from datasets import Audio, Dataset
# Directory that is scanned for .wav files and their matching .txt transcripts;
# change it to point at your own dataset folder.
audio_dir = "/content/drive/MyDrive/1"
def read_audio_file_as_bytes(audio_path) -> bytes:
    """Return the raw, undecoded contents of the file at *audio_path*.

    The file is opened in binary mode and read in full, so the result is the
    exact on-disk byte sequence (container header included), not decoded
    audio samples.

    Args:
        audio_path: Path of the file to read.

    Returns:
        The complete file contents as ``bytes`` (``b""`` for an empty file).

    Raises:
        OSError: If the file cannot be opened or read (for example
            ``FileNotFoundError`` when the path does not exist).
    """
    with open(audio_path, 'rb') as audio_file:
        return audio_file.read()
# Collect the .wav files found directly inside audio_dir. The listing is sorted
# because os.listdir() returns entries in arbitrary order, and a stable row
# order makes the dataset and the CSV reproducible between runs.
audio_files = [
    os.path.join(audio_dir, filename)
    for filename in sorted(os.listdir(audio_dir))
    if filename.endswith(".wav")
]

# Build one metadata record per audio file.
data = []
for audio_file in audio_files:
    # The transcript is expected next to the audio file, with the same base
    # name and a .txt extension.
    base_name = os.path.splitext(audio_file)[0]
    text_file = f"{base_name}.txt"
    try:
        # Read as UTF-8 explicitly so non-ASCII transcripts do not depend on
        # the platform's default encoding.
        with open(text_file, 'r', encoding='utf-8') as file:
            # Drop the trailing newline / surrounding whitespace that text
            # editors leave behind, so it does not end up in the label.
            sentence_text = file.read().strip()
    except FileNotFoundError:
        sentence_text = ""  # If the text file doesn't exist
    data.append({
        'path': audio_file,
        # Store the file path, not the raw bytes: the column is cast to an
        # `Audio` feature below, which decodes the file lazily on access.
        'audio': audio_file,
        'sentence': sentence_text,  # Include the sentence text
        'age': '',  # Modify this to include the age information
        # NOTE(review): Common Voice appears to name this column 'gender';
        # rename it if downstream code expects that -- confirm.
        'male': '',  # Modify this to include the gender information
    })

# Create the dataset column by column from the records.
audio_dataset = Dataset.from_dict({
    column: [record[column] for record in data]
    for column in ("path", "audio", "sentence", "age", "male")
})
# Turn the 'audio' column into an Audio feature so each row is decoded on
# access to {'path', 'array', 'sampling_rate'}. No sampling rate is forced
# here, so files keep their native rate; resample later with
# cast_column("audio", Audio(sampling_rate=...)) if the model needs it.
audio_dataset = audio_dataset.cast_column("audio", Audio())

# Create a CSV file with the metadata. The rows are written from the plain
# records rather than by iterating the dataset, so no audio is decoded and no
# binary data is dumped into the file: the 'audio' column holds the file path.
csv_filename = "/content/drive/MyDrive/1/output.csv"  # Modify the path as needed
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ["path", "sentence", "audio", "age", "male"]
    csv_writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    # Write the header
    csv_writer.writeheader()
    csv_writer.writerows(data)
print(f"CSV file '{csv_filename}' has been created with 'path', 'sentence', 'audio', 'age', and 'male' columns.")