Issue with code

Hello all! I’m getting an error message with this code:
import pandas as pd
import numpy as np
from transformers import pipeline
import torch
from scipy.stats import entropy
from datetime import datetime

def batch_zero_shot_classification(
    texts,
    zero_shot_pipeline,
    candidate_labels,
    hypothesis_template="This text is about {}.",
    batch_size=16
):
    all_results = []
    n = len(texts)
    for start_i in range(0, n, batch_size):
        end_i = min(start_i + batch_size, n)
        batch_texts = texts[start_i:end_i]

        batch_outputs = zero_shot_pipeline(
            batch_texts,
            candidate_labels=candidate_labels,
            hypothesis_template=hypothesis_template
        )

        if isinstance(batch_outputs, dict):
            batch_outputs = [batch_outputs]

        all_results.extend(batch_outputs)

    return all_results

def soft_label_filter(results, min_confidence=0.5, entropy_threshold=1.2):
    clean = []
    for result in results:
        scores = result["scores"]
        labels = result["labels"]
        top_label = labels[0]
        top_score = scores[0]
        label_entropy = entropy(scores)
        if top_score >= min_confidence and label_entropy <= entropy_threshold:
            clean.append(top_label)
        else:
            clean.append("uncertain")
    return clean

def main(excel_file_path, sheet_name=0):
    device_id = 0 if torch.cuda.is_available() else -1
    batch_size_ca = 16
    batch_size_topics = 16

    print("Reading Excel file...")
    df = pd.read_excel(excel_file_path, sheet_name=sheet_name, dtype=str)
    required_columns = ["Title", "subjectTerms", "classification", "identifierKeywords", "pubdate"]
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Required column '{col}' is missing from the Excel file.")

    df["pubdate"] = pd.to_datetime(df["pubdate"], errors="coerce")
    df.dropna(subset=["pubdate"], inplace=True)
    df[required_columns] = df[required_columns].fillna("")
    df["combined_text"] = (
        df["Title"] + " " +
        df["subjectTerms"] + " " +
        df["classification"] + " " +
        df["identifierKeywords"]
    ).str.strip()

    candidate_labels_for_california = ["California", "Not California"]
    print("Loading zero-shot pipeline for California detection...")
    zero_shot_california = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli",
        device=device_id
    )

    print("Classifying for California relevance...")
    texts_ca = df["combined_text"].tolist()
    ca_results = batch_zero_shot_classification(
        texts=texts_ca,
        zero_shot_pipeline=zero_shot_california,
        candidate_labels=candidate_labels_for_california,
        batch_size=batch_size_ca
    )

    filtered_labels = soft_label_filter(ca_results)
    df["is_california"] = [lbl == "California" for lbl in filtered_labels]

    removed_df = df[~df["is_california"]].copy()
    kept_df = df[df["is_california"]].copy()
    print("Removed:", len(removed_df))
    removed_df.to_excel("removed_articles.xlsx", index=False)

    df = kept_df.reset_index(drop=True)

    print("Loading zero-shot pipeline for topic detection...")
    zero_shot_topic = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli",
        device=device_id
    )

    df_texts = df["combined_text"].tolist()
    topic_results = batch_zero_shot_classification(
        texts=df_texts,
        zero_shot_pipeline=zero_shot_topic,
        candidate_labels=[],  # Let AI generate new categories
        batch_size=batch_size_topics
    )

    predicted_categories = soft_label_filter(topic_results)
    df["predicted_category"] = predicted_categories

    df_2021 = df[df["pubdate"].dt.year == 2021].copy()
    df_2021["iso_year"] = df_2021["pubdate"].dt.isocalendar().year
    df_2021["iso_week"] = df_2021["pubdate"].dt.isocalendar().week
    df_2021["iso_day"] = df_2021["pubdate"].dt.isocalendar().day
    df_2021 = df_2021[df_2021["iso_year"] == 2021]

    weekly_top_categories = {}
    for week_number in sorted(df_2021["iso_week"].unique()):
        subset_week = df_2021[df_2021["iso_week"] == week_number]
        if len(subset_week) == 0:
            continue
        top_cat = subset_week["predicted_category"].value_counts().idxmax()
        weekly_top_categories[week_number] = top_cat

    print("\n=== TOP CATEGORIES BY ISO WEEK (2021) ===")
    for week_num in range(1, 54):
        if week_num in weekly_top_categories:
            print(f"ISO Week {week_num}: {weekly_top_categories[week_num]}")
        else:
            print(f"ISO Week {week_num}: No data")

    df_2021.to_excel("kept_and_categorized_articles_2021.xlsx", index=False)
    print("Done.")

if __name__ == "__main__":
    excel_file_path = r"C:\Users\CENSORED\Downloads\DocumentschatGPT.xlsx"
    main(excel_file_path)

import numpy as np
import pandas as pd

def external_gamma_per_row(df, columns, base_column="col_4"):
    gamma = df[base_column] / len(columns)
    gamma = gamma.replace(0, 1e-6)

    adjusted_columns = []
    for col in columns:
        adjusted = df[col] / gamma
        adjusted_columns.append(adjusted)

    result = np.prod(adjusted_columns, axis=0)
    return result

Here’s some background information:
The project covers major events from California spanning 1/1/2021 to 12/31/2021. The data is held in an Excel spreadsheet with the following columns: Title, pubdate (publication date), document URL, identifier key (key things/subjects about the article), Subject Term (the main person/group/association the article relates to), and Objects (very similar to Subject Term). I’m using Hugging Face’s RoBERTa for this project, as well as Llama and zero-shot classification. I’ve tasked the AI with the following: remove articles that are not related to California, then group the remaining articles by week (52 weeks total), then summarize the most prominent event of each week (for example, if a majority of news articles in week 16 talk about a new policy, then: Week 16: New Policy). The AI was tasked with figuring out whether an article is related to California and, if so, what it’s talking about, by looking at the Title and the identifier keys.

The results should look like:
Week 1: (X)
Week 2: (X)
…
Week 52: (X)

And

Month 1: (X)
…
Month 12: (X)
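
In pandas terms, I’m assuming the monthly half would be something like this (a rough sketch, not code I’ve run; it assumes the 2021 dataframe already has a datetime pubdate column and a predicted_category column, as in my script above):

# Rough sketch of the "Month N: (X)" output, assuming df_2021 has a datetime
# "pubdate" column and a "predicted_category" column.
monthly_top = (
    df_2021.groupby(df_2021["pubdate"].dt.month)["predicted_category"]
    .agg(lambda s: s.value_counts().idxmax())
)
for month, category in monthly_top.items():
    print(f"Month {month}: {category}")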

Any advice?


I tried having it corrected by Hugging Chat, but it’s a mystery whether it works.

If the code as a whole is not working properly, it is best to break it down into parts and debug them one by one. You can see how well it is working and where it is going wrong…
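
For example, a small smoke test like this (a rough sketch; the file path is a placeholder and the column names are taken from your script) lets you check the California pass on a handful of rows before running the whole pipeline:

import pandas as pd
from transformers import pipeline

# Smoke test: run the California pass on only the first few rows so any
# problem (missing columns, empty text, model loading) shows up fast.
df = pd.read_excel("your_file.xlsx", dtype=str).fillna("")
sample_texts = (df["Title"] + " " + df["identifierKeywords"]).str.strip().head(5).tolist()

clf = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=-1)
outputs = clf(sample_texts, candidate_labels=["California", "Not California"])

for text, out in zip(sample_texts, outputs):
    print(text[:60], "->", out["labels"][0], round(out["scores"][0], 3))

If those five predictions already look wrong, the problem is in the data or the labels rather than in the batching or grouping code.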

Well, you could just leave it all to the AI.

import pandas as pd
import numpy as np
from transformers import pipeline
import torch
from scipy.stats import entropy
from datetime import datetime

def batch_zero_shot_classification(
    texts,
    zero_shot_pipeline,
    candidate_labels,
    hypothesis_template="This text is about {}.",
    batch_size=16
):
    all_results = []
    n = len(texts)
    for start_i in range(0, n, batch_size):
        end_i = min(start_i + batch_size, n)
        batch_texts = texts[start_i:end_i]

        batch_outputs = zero_shot_pipeline(
            batch_texts,
            candidate_labels=candidate_labels,
            hypothesis_template=hypothesis_template
        )

        if isinstance(batch_outputs, dict):
            batch_outputs = [batch_outputs]

        all_results.extend(batch_outputs)

    return all_results

def soft_label_filter(results, min_confidence=0.5, entropy_threshold=1.2):
    clean = []
    for result in results:
        scores = result["scores"]
        labels = result["labels"]
        top_label = labels[0]
        top_score = scores[0]
        label_entropy = entropy(scores)
        if top_score >= min_confidence and label_entropy <= entropy_threshold:
            clean.append(top_label)
        else:
            clean.append("uncertain")
    return clean

def main(excel_file_path, sheet_name=0):
    device_id = 0 if torch.cuda.is_available() else -1
    batch_size_ca = 16
    batch_size_topics = 16

    print("Reading Excel file...")
    df = pd.read_excel(excel_file_path, sheet_name=sheet_name, dtype=str)
    required_columns = ["Title", "subjectTerms", "classification", "identifierKeywords", "pubdate"]
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Required column '{col}' is missing from the Excel file.")

    df["pubdate"] = pd.to_datetime(df["pubdate"], errors="coerce")
    df.dropna(subset=["pubdate"], inplace=True)
    df[required_columns] = df[required_columns].fillna("")
    df["combined_text"] = (
        df["Title"] + " " +
        df["subjectTerms"] + " " +
        df["classification"] + " " +
        df["identifierKeywords"]
    ).str.strip()

    candidate_labels_for_california = ["California", "Not California"]
    print("Loading zero-shot pipeline for California detection...")
    zero_shot_california = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli",
        device=device_id
    )

    print("Classifying for California relevance...")
    texts_ca = df["combined_text"].tolist()
    ca_results = batch_zero_shot_classification(
        texts=texts_ca,
        zero_shot_pipeline=zero_shot_california,
        candidate_labels=candidate_labels_for_california,
        batch_size=batch_size_ca
    )

    filtered_labels = soft_label_filter(ca_results)
    df["is_california"] = [lbl == "California" for lbl in filtered_labels]

    removed_df = df[~df["is_california"]].copy()
    kept_df = df[df["is_california"]].copy()
    print("Removed:", len(removed_df))
    removed_df.to_excel("removed_articles.xlsx", index=False)

    df = kept_df.reset_index(drop=True)

    print("Loading zero-shot pipeline for topic detection...")
    zero_shot_topic = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli",
        device=device_id
    )

    # Define specific topic labels for better categorization
    candidate_labels_topics = [
        "Wildfires",
        "Climate Change",
        "Politics",
        "Economy",
        "Education",
        "Healthcare",
        "Technology",
        "Environmental Policy"
    ]

    df_texts = df["combined_text"].tolist()
    topic_results = batch_zero_shot_classification(
        texts=df_texts,
        zero_shot_pipeline=zero_shot_topic,
        candidate_labels=candidate_labels_topics,
        batch_size=batch_size_topics
    )

    predicted_categories = soft_label_filter(topic_results)
    df["predicted_category"] = predicted_categories

    # Filter for 2021 data
    df_2021 = df[df["pubdate"].dt.year == 2021].copy()
    df_2021["iso_year"] = df_2021["pubdate"].dt.isocalendar().year
    df_2021["iso_week"] = df_2021["pubdate"].dt.isocalendar().week
    df_2021["iso_day"] = df_2021["pubdate"].dt.isocalendar().day
    df_2021 = df_2021[df_2021["iso_year"] == 2021]

    # Group by week
    weekly_top_categories = {}
    for week_number in sorted(df_2021["iso_week"].unique()):
        subset_week = df_2021[df_2021["iso_week"] == week_number]
        if len(subset_week) == 0:
            continue
        top_cat = subset_week["predicted_category"].value_counts().idxmax()
        weekly_top_categories[week_number] = top_cat

    print("\n=== TOP CATEGORIES BY ISO WEEK (2021) ===")
    for week_num in range(1, 54):
        if week_num in weekly_top_categories:
            print(f"Week {week_num}: {weekly_top_categories[week_num]}")
        else:
            print(f"Week {week_num}: No data")

    # Group by month
    monthly_top_categories = {}
    for month_number in range(1, 13):
        subset_month = df_2021[df_2021["pubdate"].dt.month == month_number]
        if len(subset_month) == 0:
            continue
        top_cat = subset_month["predicted_category"].value_counts().idxmax()
        monthly_top_categories[month_number] = top_cat

    print("\n=== TOP CATEGORIES BY MONTH (2021) ===")
    for month_num in range(1, 13):
        if month_num in monthly_top_categories:
            print(f"Month {month_num}: {monthly_top_categories[month_num]}")
        else:
            print(f"Month {month_num}: No data")

    df_2021.to_excel("kept_and_categorized_articles_2021.xlsx", index=False)
    print("Done.")

if __name__ == "__main__":
    excel_file_path = r"C:\Users\CENSORED\Downloads\DocumentschatGPT.xlsx"
    main(excel_file_path)

There’s a major issue with that particular block of code: it confines the topics to a handful of very broad, basic categories, such as “Wildfires, Climate Change, etc.”, which I believe may have caused failures in previous iterations. Also, that code looks very similar to what ChatGPT had given me, which usually (and unfortunately) contains many inaccuracies for longer, more niche scripts.


Chatbots usually work by responding with general information. You’ll need to modify this part to suit your needs (see the sketch after this snippet).

    # Define specific topic labels for better categorization
    candidate_labels_topics = [
        "Wildfires",
        "Climate Change",
        "Politics",
        "Economy",
        "Education",
        "Healthcare",
        "Technology",
        "Environmental Policy"
    ]
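
One way to keep the labels tied to your own data instead of hard-coding generic topics (a rough sketch; the file path is a placeholder, and the separator guess and the cutoff of 30 terms are assumptions) is to build the candidate list from the most frequent subjectTerms:

import pandas as pd

df = pd.read_excel("your_file.xlsx", dtype=str).fillna("")

# subjectTerms often holds several terms per row; assuming they are separated
# by commas or semicolons, split them apart and count how often each appears.
terms = (
    df["subjectTerms"]
    .str.replace(";", ",")
    .str.split(",")
    .explode()
    .str.strip()
)
terms = terms[terms != ""]

# Take the most frequent terms as candidate labels for the topic pass
# (the cutoff of 30 is arbitrary).
candidate_labels_topics = terms.value_counts().head(30).index.tolist()
print(candidate_labels_topics)

Bear in mind that zero-shot classification scores every article against every label, so a long label list will be slow; trimming or merging similar terms first helps.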

You could also choose to consider all articles as candidates from the start, without filtering…

Edit:
I wonder if zero-shot classification is suited to selecting the most prominent topic…
It is a model for classifying texts into categories you provide, not for discovering topics on its own.
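
If you want the topics to come out of the articles themselves rather than from a fixed list, one alternative worth trying (a sketch only; sentence-transformers is a separate install, and the model name and cluster count are assumptions) is to embed the combined text, cluster it, and label each week by its largest cluster:

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# Rebuild the combined text the same way as in the script above
# (file path and columns are placeholders from that script).
df = pd.read_excel("your_file.xlsx", dtype=str).fillna("")
df["pubdate"] = pd.to_datetime(df["pubdate"], errors="coerce")
df = df.dropna(subset=["pubdate"])
df["combined_text"] = (df["Title"] + " " + df["identifierKeywords"]).str.strip()

# Embed each article and group the embeddings into clusters.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(df["combined_text"].tolist(), show_progress_bar=True)

kmeans = KMeans(n_clusters=12, random_state=0, n_init=10)
df["cluster"] = kmeans.fit_predict(embeddings)

# Most common cluster per ISO week; you would still skim a few titles per
# cluster to give each one a human-readable name.
df["iso_week"] = df["pubdate"].dt.isocalendar().week
weekly_top_cluster = (
    df.groupby("iso_week")["cluster"]
    .agg(lambda s: s.value_counts().idxmax())
)
print(weekly_top_cluster)

BERTopic packages roughly this embedding-plus-clustering approach if you would rather not tune it by hand.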