How would you determine a principled scale of data for finetuning a model?

I’m working on a test to see how much data is needed to properly, or “ideally”, finetune an embedding model for a more targeted problem. Forgive me if I’m wrong about this approach, but -

If an embedding represents, in some sense, the linguistic “address” of a given text,
And finetuning is tweaking the embedding space to better represent a given corpus,
Then, in theory, shouldn’t finetuning on that corpus help a downstream regression model predict some outcome variable of interest?

In this example, I have ≈120k descriptions of cars, along with their final sale price, in an encoded_texts.json file (line-delimited JSON, one {"price": float, "text": str, "id": {"$oid": str}} object per line).
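Each line looks roughly like this (values are illustrative, not a real record):

{"price": 23500.0, "text": "2018 Honda Accord EX-L sedan, one owner, 42k miles, leather, sunroof, clean title...", "id": {"$oid": "64a1f0c2e4b0a1b2c3d4e5f6"}}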

My thought was that I could embed the text of each observation, then use that matrix of embeddings to predict the price with an XGBoost model. The supposition is that, as you increase the number of texts used for finetuning, the R²/mean error/median error should indicate that the model is getting more accurate, since (again, correct me if my very general assumptions are wrong here) the finetuned model would be “better representing” these texts, capturing more of their nuance, and would therefore explain more of the variance in price downstream of the description. Is this totally off base, or is it a reasonable chain of reasoning?
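In pseudocode, the experiment looks like this (the helper names here are just placeholders for the real methods in the code below):

for n in range(1000, 120000, 1000):
    finetuned = finetune(base_model, texts[:n])       # unsupervised finetuning on n texts
    X = finetuned.encode(all_texts)                   # embeddings as regression features
    score = cross_val_r2(xgb_model, X, log_prices)    # downstream supervised metric
    # hypothesis: score should increase (or at least not decrease) with n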

Given that question, my code is as follows. I invite any sense-making here, as I’m still not quite sure what the newest syntax is: I had an older version of this working several years ago that produced exactly these results (as I increased the number of finetuning records, I observed increasing R²), but this new draft I’m working on is returning… not promising results. It also has the problem that I’m not handling an early stopping criterion, when I think one is clearly required as the data scales up (I’ve put a rough sketch of what I mean at the end of the post). I would love some discussion and back-and-forth about all of this: the initial chain of reasoning, the particular implementation I’m drafting here, and whether there are conceptual errors that easily explain the results I’m seeing (R² is decreasing with added finetuning scale, at least for the first few training steps):

import json
from datetime import datetime

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import ContrastiveTensionLoss, ContrastiveTensionDataLoader

MODEL_NAME = "sentence-transformers/all-distilroberta-v1"

class Finetuner:
    @staticmethod
    def evaluate_model(transformer, file_path="encoded_texts.json"):
        # Embed every description, then 5-fold cross-validate an XGBoost
        # regressor on log(price), pooling the out-of-fold predictions
        with open(file_path) as f:
            dataset = [json.loads(line) for line in f if line.strip()]
        embeddings = transformer.encode([e["text"] for e in dataset])
        encoded_texts = [
            {"id": row["id"]["$oid"], "price": row["price"], "embedding": embeddings[i]}
            for i, row in enumerate(dataset)
        ]
        np.random.shuffle(encoded_texts)
        model = xgb.XGBRegressor(
            n_estimators=1800,
            learning_rate=0.02,
            max_depth=8,
            min_child_weight=9,
            subsample=0.9,
            gamma=0.1,
            colsample_bytree=1.0,
            tree_method="hist",  # XGBoost 2.x syntax; "gpu_hist" is the deprecated pre-2.0 equivalent
            device="cuda",
            n_jobs=400
        )
        X = pd.DataFrame([e["embedding"] for e in encoded_texts])
        # log-transform the target, since raw sale prices are heavy-tailed
        y = pd.Series([np.log(e["price"] + 1) for e in encoded_texts])
        ids = pd.Series([e["id"] for e in encoded_texts])
        y_preds = []
        actual_ys = []
        actual_ids = []
        kf = KFold(n_splits=5)
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            actual_ids.extend(ids.iloc[test_index].tolist())
            actual_ys.extend(y_test.tolist())
            model.fit(X_train, y_train)  # the sklearn API refits from scratch each fold
            y_preds.extend(model.predict(X_test))
        y_preds = np.array(y_preds)
        actual_ys = np.array(actual_ys)
        return {
            "r2_score": r2_score(actual_ys, y_preds),
            "median": np.median(np.abs(actual_ys - y_preds)),
            "mean": np.mean(np.abs(actual_ys - y_preds))
        }
    
    @staticmethod
    def finetune_model(output_model_name="finetuned_model", document_sample_count=1000,
                       file_path="encoded_texts.json", model_name=MODEL_NAME,
                       per_device_train_batch_size=16, use_fp16=True):
        # Load pre-trained model
        model = SentenceTransformer(model_name, trust_remote_code=True)
        # Load and shuffle the raw texts
        with open(file_path) as f:
            train_sentences = [json.loads(line)["text"] for line in f if line.strip()]
        np.random.shuffle(train_sentences)

        # Scale epochs up slowly with more data: 50 epochs at 1k docs,
        # int(50 * 120 ** 0.25) = 165 epochs at 120k docs
        base_epochs = 50
        num_train_epochs = max(base_epochs, int(base_epochs * (document_sample_count / 1000) ** 0.25))

        # Contrastive Tension is unsupervised and needs its dedicated data
        # loader; as far as I can tell that loader is only compatible with
        # the older fit() API, not the newer SentenceTransformerTrainer
        train_dataloader = ContrastiveTensionDataLoader(train_sentences[:document_sample_count],
                                                        batch_size=per_device_train_batch_size,
                                                        pos_neg_ratio=4)
        # Define the loss function
        loss = ContrastiveTensionLoss(model=model)

        # Compute the output path once so that training, saving, and the
        # consistency check all agree on the same directory (calling
        # datetime.now() more than once can yield two different paths)
        output_dir = f"output/{output_model_name}-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
        model.fit(
            [(train_dataloader, loss)],
            epochs=num_train_epochs,
            use_amp=use_fp16,  # mixed-precision training
            output_path=output_dir
        )
        model.save(output_dir)
        print(f"Training done and model saved to: {output_dir} with {num_train_epochs} epochs")
        return model, output_dir
    
    @staticmethod
    def cosine_similarity_numpy(a, b):
        # Plain-numpy cosine similarity: a scalar for two vectors, a
        # pairwise matrix for two 2-D arrays
        a = np.array(a)
        b = np.array(b)
        dot_product = np.dot(a, b.T)
        a_norm = np.linalg.norm(a, axis=-1, keepdims=True)
        b_norm = np.linalg.norm(b, axis=-1, keepdims=True)
        similarity = dot_product / (a_norm * b_norm.T)
        return similarity
    
    @staticmethod
    def consistency_check(model, output_dir, input_text):
        # Sanity check that the model reloaded from disk embeds
        # (near-)identically to the in-memory finetuned model
        loaded_model = SentenceTransformer(output_dir, trust_remote_code=True)
        assert Finetuner.cosine_similarity_numpy(
            model.encode(input_text), loaded_model.encode(input_text)
        ) > 0.99
    
    @staticmethod
    def run(finetune_document_sample_count=1000, file_path="encoded_texts.json"):
        model, output_dir = Finetuner.finetune_model(document_sample_count=finetune_document_sample_count, file_path=file_path)
        Finetuner.consistency_check(model, output_dir, "hello world")
        return Finetuner.evaluate_model(model, file_path=file_path)
    
    @staticmethod
    def full_finetuning_test():
        # Sweep the finetuning sample count from 1k to 119k in 1k steps,
        # recording the downstream metrics at each scale
        results = []
        for i in range(1000, 120000, 1000):
            test_result = Finetuner.run(i)
            print(i, test_result)
            results.append({"sample_count": i, **test_result})
        return results
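And here is roughly what I have in mind for the early stopping criterion mentioned above - an untested sketch that stops the scale-up sweep once the cross-validated R² hasn’t improved for a few consecutive steps (the patience value is an arbitrary placeholder):

def full_finetuning_test_with_early_stop(patience=3, step=1000, max_count=120000):
    # Same sweep as full_finetuning_test, but stop once R^2 has failed to
    # improve for `patience` consecutive scale steps
    results = []
    best_r2 = -np.inf
    steps_without_improvement = 0
    for i in range(step, max_count, step):
        test_result = Finetuner.run(i)
        print(i, test_result)
        results.append({"sample_count": i, **test_result})
        if test_result["r2_score"] > best_r2:
            best_r2 = test_result["r2_score"]
            steps_without_improvement = 0
        else:
            steps_without_improvement += 1
        if steps_without_improvement >= patience:
            print(f"No R^2 improvement in {patience} steps; stopping at {i} documents")
            break
    return results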