Hello all! I’m getting an error message with this code:
import pandas as pd
import numpy as np
from transformers import pipeline
import torch
from scipy.stats import entropy
from datetime import datetime
def batch_zero_shot_classification(
    texts,
    zero_shot_pipeline,
    candidate_labels,
    hypothesis_template="This text is about {}.",
    batch_size=16
):
    all_results = []
    n = len(texts)
    for start_i in range(0, n, batch_size):
        end_i = min(start_i + batch_size, n)
        batch_texts = texts[start_i:end_i]
        batch_outputs = zero_shot_pipeline(
            batch_texts,
            candidate_labels=candidate_labels,
            hypothesis_template=hypothesis_template
        )
        # The pipeline returns a single dict for one text and a list for many.
        if isinstance(batch_outputs, dict):
            batch_outputs = [batch_outputs]
        all_results.extend(batch_outputs)
    return all_results
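# Note: I'm assuming each zero-shot result is a dict of the form
# {"sequence": "...", "labels": ["California", "Not California"], "scores": [0.91, 0.09]},
# with labels sorted by descending score, which is why labels[0] / scores[0] below
# is treated as the top prediction.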
def soft_label_filter(results, min_confidence=0.5, entropy_threshold=1.2):
    clean = []
    for result in results:
        scores = result["scores"]
        labels = result["labels"]
        top_label = labels[0]
        top_score = scores[0]
        # Low entropy means the score mass is concentrated on a single label.
        label_entropy = entropy(scores)
        if top_score >= min_confidence and label_entropy <= entropy_threshold:
            clean.append(top_label)
        else:
            clean.append("uncertain")
    return clean
def main(excel_file_path, sheet_name=0):
    device_id = 0 if torch.cuda.is_available() else -1
    batch_size_ca = 16
    batch_size_topics = 16

    print("Reading Excel file...")
    df = pd.read_excel(excel_file_path, sheet_name=sheet_name, dtype=str)
    required_columns = ["Title", "subjectTerms", "classification", "identifierKeywords", "pubdate"]
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Required column '{col}' is missing from the Excel file.")
    df["pubdate"] = pd.to_datetime(df["pubdate"], errors="coerce")
    df.dropna(subset=["pubdate"], inplace=True)
    df[required_columns] = df[required_columns].fillna("")
    df["combined_text"] = (
        df["Title"] + " " +
        df["subjectTerms"] + " " +
        df["classification"] + " " +
        df["identifierKeywords"]
    ).str.strip()

    candidate_labels_for_california = ["California", "Not California"]
    print("Loading zero-shot pipeline for California detection...")
    zero_shot_california = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli",
        device=device_id
    )
    print("Classifying for California relevance...")
    texts_ca = df["combined_text"].tolist()
    ca_results = batch_zero_shot_classification(
        texts=texts_ca,
        zero_shot_pipeline=zero_shot_california,
        candidate_labels=candidate_labels_for_california,
        batch_size=batch_size_ca
    )
    filtered_labels = soft_label_filter(ca_results)
    df["is_california"] = [lbl == "California" for lbl in filtered_labels]
    removed_df = df[~df["is_california"]].copy()
    kept_df = df[df["is_california"]].copy()
    print("Removed:", len(removed_df))
    removed_df.to_excel("removed_articles.xlsx", index=False)
    df = kept_df.reset_index(drop=True)

    print("Loading zero-shot pipeline for topic detection...")
    zero_shot_topic = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli",
        device=device_id
    )
    df_texts = df["combined_text"].tolist()
    topic_results = batch_zero_shot_classification(
        texts=df_texts,
        zero_shot_pipeline=zero_shot_topic,
        candidate_labels=[],  # Let AI generate new categories
        batch_size=batch_size_topics
    )
    predicted_categories = soft_label_filter(topic_results)
    df["predicted_category"] = predicted_categories

    df_2021 = df[df["pubdate"].dt.year == 2021].copy()
    df_2021["iso_year"] = df_2021["pubdate"].dt.isocalendar().year
    df_2021["iso_week"] = df_2021["pubdate"].dt.isocalendar().week
    df_2021["iso_day"] = df_2021["pubdate"].dt.isocalendar().day
    df_2021 = df_2021[df_2021["iso_year"] == 2021]

    weekly_top_categories = {}
    for week_number in sorted(df_2021["iso_week"].unique()):
        subset_week = df_2021[df_2021["iso_week"] == week_number]
        if len(subset_week) == 0:
            continue
        top_cat = subset_week["predicted_category"].value_counts().idxmax()
        weekly_top_categories[week_number] = top_cat

    print("\n=== TOP CATEGORIES BY ISO WEEK (2021) ===")
    for week_num in range(1, 54):
        if week_num in weekly_top_categories:
            print(f"ISO Week {week_num}: {weekly_top_categories[week_num]}")
        else:
            print(f"ISO Week {week_num}: No data")

    df_2021.to_excel("kept_and_categorized_articles_2021.xlsx", index=False)
    print("Done.")

if __name__ == "__main__":
    excel_file_path = r"C:\Users\CENSORED\Downloads\DocumentschatGPT.xlsx"
    main(excel_file_path)
import numpy as np
import pandas as pd

def external_gamma_per_row(df, columns, base_column="col_4"):
    # Per-row scaling factor; replace zeros to avoid division by zero.
    gamma = df[base_column] / len(columns)
    gamma = gamma.replace(0, 1e-6)
    adjusted_columns = []
    for col in columns:
        adjusted = df[col] / gamma
        adjusted_columns.append(adjusted)
    # Row-wise product of all gamma-adjusted columns.
    result = np.prod(adjusted_columns, axis=0)
    return result
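For what it’s worth, this is how I’d expect that helper to be called; the DataFrame and column names below are just made up for illustration:

    df_example = pd.DataFrame({
        "col_1": [2.0, 4.0],
        "col_2": [3.0, 6.0],
        "col_4": [2.0, 4.0],
    })
    # gamma = col_4 / 2 -> [1.0, 2.0]; the result is the row-wise product
    # of col_1 / gamma and col_2 / gamma, i.e. [6.0, 6.0] here.
    print(external_gamma_per_row(df_example, ["col_1", "col_2"]))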
Here’s some background information:
The project covers major events from California spanning from 1/1/2021 to 12/31/2021. The project’s data is held in an Excel spreadsheet, which provides the following fields: Title, pubdate (published date), document URL, identifier key (some key things/subjects about the article), Subject Term (the main person/group/association the article relates to), and Objects (very similar to Subject Term). I’m using HuggingFace’s RoBERTa for this project, as well as Llama and zero-shot classification. I’ve tasked the AI with doing the following: remove articles that are not related to California, then group the articles by week (52 weeks total), then summarize the most prominent event that week (for example, if a majority of news articles in week 16 talk about a new policy, then: Week 16: New Policy). The AI was tasked with figuring out whether or not an article is related to California, and if it is, what it’s talking about, by looking at the Title and the identifier keys.
The results should look like:
Week 1: (X)
Week 2: (X)
…
Week 52: (x)
And
Month 1: (x)
…
Month 12: (x).
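The script above only prints the weekly winners, so for the monthly roll-up I was picturing something roughly like the sketch below (df_2021, pubdate, and predicted_category come from the script; the month column and the monthly_top name are just placeholders I made up):

    # Most common predicted category per calendar month of 2021.
    df_2021["month"] = df_2021["pubdate"].dt.month
    monthly_top = (
        df_2021.groupby("month")["predicted_category"]
        .agg(lambda s: s.value_counts().idxmax())
    )
    for month_num in range(1, 13):
        print(f"Month {month_num}: {monthly_top.get(month_num, 'No data')}")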
Any advice?