Converting text stories into embeddings with metadata and uploading to Pinecone for chatbot and content creation

Hello everyone,

We have embarked on our first project, which involves a few hundred text stories (around 500 MB in total). Our goal is to convert these text files into embeddings, add metadata specifying the source of each story, and upload the data to Pinecone. Once uploaded, we aim to use models such as Koala-13B, GPT4-x-alpaca-13b-native-4bit-128g, and others with our private Pinecone data to develop chatbots and generate content.

We would be grateful for any suggestions, tutorials, or other valuable information to help us bring this project to life. We have all the necessary infrastructure in place and will be using Python as our preferred programming language.

Looking forward to your insights and guidance!

Best regards,
Ram.Sh

Starting with the first step, here is our script. It reads the text files, cleans them, generates embeddings, and uploads the data to Pinecone with the metadata included.

Would that be the advised way to convert the text into vectors for our final goal?

import os
import re
import uuid
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import pinecone

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize objects
stop_words = set(stopwords.words('english'))
stemmer = nltk.SnowballStemmer('english')
lemmatizer = nltk.WordNetLemmatizer()

# Define function to clean text
def clean_text(text):
    # Lowercase text
    text = text.lower()
    # Remove non-alphabetic characters
    text = re.sub('[^A-Za-z]+', ' ', text)
    # Tokenize text
    words = word_tokenize(text)
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    # Rejoin words
    cleaned_text = ' '.join(words)
    return cleaned_text

# Define function to preprocess text for embeddings
def preprocess_text(text):
    # Tokenize the text into words
    words = nltk.word_tokenize(text.lower())
    # Remove stop words
    words = [word for word in words if word not in stop_words]
    # Stem and lemmatize the words
    words = [stemmer.stem(lemmatizer.lemmatize(word, pos='v')) for word in words]
    return words

# Define function to generate a single story embedding
def generate_story_embeddings(text):
    # Preprocess text
    words = preprocess_text(text)
    # Train a Word2Vec model on this story
    # (gensim 4.x renamed the old `size` parameter to `vector_size`)
    model = Word2Vec([words], vector_size=100, window=5, min_count=1, workers=4)
    # Average the word vectors into one fixed-length vector,
    # since Pinecone expects a single vector per record
    embedding = model.wv[words].mean(axis=0)
    return embedding

# Initialize Pinecone (recent pinecone-client versions also require the environment)
pinecone.init(api_key="your-pinecone-api-key", environment="your-pinecone-environment")
pinecone_index = pinecone.Index("your-pinecone-index-name")

# Process text files
data_dir = "data/"

for filename in os.listdir(data_dir):
    if filename.endswith(".txt"):
        # Read text file
        with open(os.path.join(data_dir, filename), "r", encoding="utf-8") as f:
            text = f.read()

        # Clean and generate the story embedding
        cleaned_text = clean_text(text)
        embedding = generate_story_embeddings(cleaned_text)

        # Add metadata with filename and hardcoded category
        metadata = {'filename': filename, 'category': 'hardcoded_category'}

        # Upload the embedding and metadata to Pinecone
        # (upsert takes a list of (id, values, metadata) tuples)
        pinecone_index.upsert(vectors=[(str(uuid.uuid4()), embedding.tolist(), metadata)])

# Note: recent versions of pinecone-client no longer provide pinecone.deinit(),
# so no explicit teardown call is needed here.
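
A caveat we are unsure about: the script above trains a separate Word2Vec model per story, so vectors from different stories would not share an embedding space, which seems like it would make cross-story similarity search unreliable. An alternative we are considering is a pretrained sentence encoder. Below is a minimal sketch, assuming the sentence-transformers package and its all-MiniLM-L6-v2 model (384-dimensional output, so the Pinecone index would need dimension 384); the chunk size, overlap, and index name are placeholders:

import os
import uuid
import pinecone
from sentence_transformers import SentenceTransformer

# Pretrained sentence encoder; all-MiniLM-L6-v2 produces 384-dim vectors
model = SentenceTransformer("all-MiniLM-L6-v2")

pinecone.init(api_key="your-pinecone-api-key", environment="your-pinecone-environment")
index = pinecone.Index("your-pinecone-index-name")  # index dimension must be 384

def chunk_text(text, chunk_size=1000, overlap=200):
    # Split a story into overlapping character chunks so each piece
    # fits comfortably in the encoder's input window
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += chunk_size - overlap
    return chunks

data_dir = "data/"
for filename in os.listdir(data_dir):
    if not filename.endswith(".txt"):
        continue
    with open(os.path.join(data_dir, filename), "r", encoding="utf-8") as f:
        text = f.read()
    chunks = chunk_text(text)
    vectors = model.encode(chunks)  # one embedding per chunk
    records = [
        (str(uuid.uuid4()), vec.tolist(),
         {"filename": filename, "chunk": i, "text": chunk})
        for i, (vec, chunk) in enumerate(zip(vectors, chunks))
    ]
    index.upsert(vectors=records)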
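
For the chatbot step itself, our rough plan for retrieval looks like the sketch below (the question text and top_k are placeholders; the retrieved chunks would be pasted into the prompt we send to Koala or Alpaca as context):

# Embed the user's question with the same encoder used at indexing time
query = "What happens in the story about the lighthouse keeper?"
query_vector = model.encode([query])[0].tolist()

# Retrieve the most similar story chunks along with their metadata
results = index.query(vector=query_vector, top_k=5, include_metadata=True)

# Assemble the retrieved chunk texts into a context block for the LLM prompt
context = "\n\n".join(match["metadata"]["text"] for match in results["matches"])
prompt = f"Answer using only this context:\n{context}\n\nQuestion: {query}"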