Hello all. I’m working on a research project analyzing major events in California from 1/1/2021 to 12/31/2021. The data is held in an Excel spreadsheet with the following columns: Title, pubdate (publication date), document URL, identifier keywords (key subjects of the article), subject terms (the main person, group, or association the article relates to), and objects (very similar to subject terms). I’m using Hugging Face’s RoBERTa for this project, as well as Llama and zero-shot classification. I’ve tasked the model with the following: remove articles that are not related to California, group the remaining articles by week (52 weeks total), then summarize the most prominent event of each week (for example, if a majority of week 16’s articles discuss a new policy, then output “Week 16: New Policy”). The model decides whether an article is related to California, and if so what it’s about, by looking at the Title and the identifier keywords.
The issue: when I ran the code, it produced no results. What it spit out was basically:
Week 1: Blank
Week 2: Blank
Week 3: Blank
…
Week 52: Blank
I’m perplexed as to why nothing shows up; I can’t find any issues in my code. Do you see any potential problems that would explain the empty results?
Code:
import pandas as pd
import numpy as np
from transformers import pipeline
import torch
def batch_zero_shot_classification(
    texts,
    zero_shot_pipeline,
    candidate_labels,
    hypothesis_template="This text is about {}.",
    batch_size=16
):
    """
    Classify a list of input texts in batches using a zero-shot classification pipeline.
    Returns a list of dicts, each dict containing the pipeline output for a single text.

    :param texts: List of strings to classify.
    :param zero_shot_pipeline: A Hugging Face zero-shot classification pipeline.
    :param candidate_labels: A list of possible labels (strings).
    :param hypothesis_template: Template for hypothesis (default for English).
    :param batch_size: Number of texts to classify at once.
    :return: List of classification results (each is a dict).
    """
    all_results = []
    n = len(texts)
    for start_i in range(0, n, batch_size):
        end_i = min(start_i + batch_size, n)
        batch_texts = texts[start_i:end_i]
        # Run batch classification
        batch_outputs = zero_shot_pipeline(
            batch_texts,
            candidate_labels=candidate_labels,
            hypothesis_template=hypothesis_template
        )
        # If the pipeline returns a single dict for some reason, wrap it
        if isinstance(batch_outputs, dict):
            batch_outputs = [batch_outputs]
        all_results.extend(batch_outputs)
    return all_results
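
# A quick sanity check for the helper above (illustrative only; the text and
# labels here are made up, and this assumes the model downloads successfully):
#
#   clf = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
#   out = batch_zero_shot_classification(
#       ["Wildfire near Sacramento"], clf, ["California", "Not California"]
#   )
#   print(out[0]["labels"][0], out[0]["scores"][0])  # top label and its score
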
def main(excel_file_path, sheet_name=0):
    # ------------------------------------------------
    # 0. DEVICE CONFIG & PARAMS
    # ------------------------------------------------
    device_id = 0 if torch.cuda.is_available() else -1
    batch_size_ca = 16
    batch_size_topics = 16

    # ------------------------------------------------
    # 1. LOAD DATA
    # ------------------------------------------------
    print("Reading Excel file...")
    df = pd.read_excel(excel_file_path, sheet_name=sheet_name, dtype=str)
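    # NOTE (added check): confirm the sheet actually has rows before any
    # filtering happens, so an empty or wrong sheet is caught early.
    print(f"Loaded {len(df)} rows from the spreadsheet")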
    # Ensure required columns exist
    required_columns = ["Title", "subjectTerms", "classification", "identifierKeywords", "pubdate"]
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Required column '{col}' is missing from the Excel file.")

    # Convert pubdate to datetime
    df["pubdate"] = pd.to_datetime(df["pubdate"], errors="coerce")
    df.dropna(subset=["pubdate"], inplace=True)
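    # NOTE (added diagnostic): because the file is read with dtype=str and the
    # parse uses errors="coerce", any pubdate format pandas can't infer becomes
    # NaT, and the dropna above then silently empties the DataFrame. Printing
    # the surviving row count makes that failure mode visible.
    print(f"Rows with a parseable pubdate: {len(df)}")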
    # Combine text columns. Fill NaN only in the text columns: pubdate is
    # already a datetime at this point and must not be filled with a string.
    text_columns = ["Title", "subjectTerms", "classification", "identifierKeywords"]
    df[text_columns] = df[text_columns].fillna("")
    df["combined_text"] = (
        df["Title"] + " " +
        df["subjectTerms"] + " " +
        df["classification"] + " " +
        df["identifierKeywords"]
    ).str.strip()
    # ------------------------------------------------
    # 2. FILTER: IS THE ARTICLE ABOUT CALIFORNIA?
    # ------------------------------------------------
    candidate_labels_for_california = ["California", "Not California"]
    threshold_ca = 0.5
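    # NOTE (added): with only two candidate labels, the top score is always
    # >= 0.5, so this threshold never rejects anything beyond the label check;
    # a stricter value (e.g. 0.7) would be needed for it to have any effect.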
print("Loading zero-shot pipeline for California detection...")
zero_shot_california = pipeline(
"zero-shot-classification",
model="facebook/bart-large-mnli",
device=device_id
)
print("Running batch zero-shot classification to detect CA-relevant articles...")
texts_ca = df["combined_text"].tolist()
ca_results = batch_zero_shot_classification(
texts=texts_ca,
zero_shot_pipeline=zero_shot_california,
candidate_labels=candidate_labels_for_california,
batch_size=batch_size_ca
)
    # Determine which articles are about CA
    is_california_list = []
    for result in ca_results:
        labels = result["labels"]
        scores = result["scores"]
        top_label = labels[0]
        top_score = scores[0]
        is_ca = (top_label == "California") and (top_score >= threshold_ca)
        is_california_list.append(is_ca)
    df["is_california"] = is_california_list

    # Separate data
    removed_df = df[~df["is_california"]].copy()
    kept_df = df[df["is_california"]].copy()
    print("Number of articles removed (Not about California):", len(removed_df))
    removed_df.to_excel("removed_articles.xlsx", index=False)

    # Focus only on California ones
    df = kept_df.reset_index(drop=True)
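    # NOTE (added check): if the classifier puts everything in "Not California",
    # kept_df is empty and every downstream week comes out blank.
    print("Number of articles kept (about California):", len(df))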
    # ------------------------------------------------
    # 3. TOPIC CATEGORIZATION
    # ------------------------------------------------
    candidate_categories = [
        "weather",
        "government election",
        "new policy",
        "protest",
        "crime",
        "health",
        "business",
        "environment",
        "sports",
        "education",
    ]

    print("Loading zero-shot pipeline for topic categorization...")
    zero_shot_topic = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli",
        device=device_id
    )

    print("Running batch topic classification...")
    df_texts = df["combined_text"].tolist()
    topic_results = batch_zero_shot_classification(
        texts=df_texts,
        zero_shot_pipeline=zero_shot_topic,
        candidate_labels=candidate_categories,
        batch_size=batch_size_topics
    )

    predicted_categories = []
    for result in topic_results:
        top_label = result["labels"][0]
        predicted_categories.append(top_label)
    df["predicted_category"] = predicted_categories
    # ------------------------------------------------
    # 4. GROUP BY ISO WEEKS (Monday–Sunday, 2021)
    # ------------------------------------------------
    df_2021 = df[df["pubdate"].dt.year == 2021].copy()

    # Use the built-in isocalendar to get ISO year, week, day.
    # ISO weeks start on Monday and end on Sunday.
    df_2021["iso_year"] = df_2021["pubdate"].dt.isocalendar().year
    df_2021["iso_week"] = df_2021["pubdate"].dt.isocalendar().week
    df_2021["iso_day"] = df_2021["pubdate"].dt.isocalendar().day

    # If you only want 2021's ISO weeks, filter iso_year == 2021.
    df_2021 = df_2021[df_2021["iso_year"] == 2021]
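    # NOTE (added): articles dated Jan 1–3, 2021 belong to ISO week 53 of 2020
    # and are dropped by the filter above; print the remainder to confirm
    # there's still data to group.
    print(f"Articles in 2021 ISO weeks: {len(df_2021)}")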
    # For each iso_week in 2021, find the most frequent category
    weekly_top_categories = {}
    for week_number in sorted(df_2021["iso_week"].unique()):
        subset_week = df_2021[df_2021["iso_week"] == week_number]
        if len(subset_week) == 0:
            continue
        top_cat = subset_week["predicted_category"].value_counts().idxmax()
        weekly_top_categories[week_number] = top_cat
    # ------------------------------------------------
    # 5. OUTPUT RESULTS
    # ------------------------------------------------
    print("\n=== TOP CATEGORIES BY ISO WEEK (MONDAY–SUNDAY, 2021) ===")
    # 2021 has exactly 52 ISO weeks, so iterate 1 through 52
    for week_num in range(1, 53):
        if week_num in weekly_top_categories:
            print(f"ISO Week {week_num}: {weekly_top_categories[week_num]}")
        else:
            print(f"ISO Week {week_num}: No articles found / No data")

    df_2021.to_excel("kept_and_categorized_articles_2021.xlsx", index=False)
    print("\nDone. Final results saved to kept_and_categorized_articles_2021.xlsx")
if __name__ == "__main__":
    # Update this path to point to your local file:
    # Use a raw string (r"string") or double backslashes in Windows paths
    excel_file_path = r"C:\Users\CENSORED\Downloads\DocumentschatGPT.xlsx"
    main(excel_file_path)