After experimenting with different approaches in Jupyter notebooks and deciding on the best route to take, I’m ready to move my Notebook code into something that we can use for the final site.
I collaborated with ChatGPT to help me modularize and comment the messy notebook code, turning it into individual Python scripts.
Here’s where we ended up for the data retrieval script that will grab City Council closed captions from Archive.org:
import os
import json
import logging
from datetime import datetime, timedelta
from internetarchive import search_items, get_item, download, get_files

# Function to retrieve item slugs based on a search query
def get_slugs_from_query(query):
    # Perform a search on Archive.org using the provided query
    search_results = search_items(query)
    # Extract and return just the 'identifier' field from each search result
    slugs = [item['identifier'] for item in search_results]
    return slugs

# Function to get the start and end dates for a given range of weeks
def get_date_range(weeks=3):
    # Current date and time in UTC
    end_date = datetime.utcnow()
    # Calculate the start date by subtracting the number of weeks from the current date
    start_date = end_date - timedelta(weeks=weeks)
    # Return both dates in 'YYYY-MM-DD' format
    return start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')

# Function to construct a search query with a date range and subject filter
def build_query(subject, start_date, end_date):
    # Create and return a search query string using the Archive.org advanced search syntax
    return f'subject:"{subject}" AND date:[{start_date} TO {end_date}]'

# Function to convert a date string to ISO 8601 format
def convert_date_to_iso(date_str):
    # Parse the date string into a datetime object
    dt = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
    # Convert to ISO 8601 format and return
    return dt.isoformat() + 'Z'

# Function to update the date fields in the metadata to ISO format
def format_metadata(metadata):
    # List of metadata fields that contain dates
    for date_field in ['publicdate', 'addeddate']:
        # If the field is present in the metadata, update it to ISO format
        if date_field in metadata:
            metadata[date_field] = convert_date_to_iso(metadata[date_field])
    # Return the updated metadata
    return metadata

# Function to download meeting files and metadata
def download_meeting(slug, output_directory):
    print(f"download_meeting() called with slug: {slug}")  # Log which meeting is being processed
    try:
        # Retrieve item and its metadata from Archive.org
        item = get_item(slug)
        metadata = item.metadata
        # Format the date fields in metadata
        metadata = format_metadata(metadata)
        # Prepare a directory to save the downloaded files
        meeting_dir = os.path.join(output_directory, f'Council_Meetings_{slug}')
        # Create the directory if it doesn't exist
        os.makedirs(meeting_dir, exist_ok=True)
        # Look for text files in the item's files list
        txt_files = [f for f in item.files if f['name'].lower().endswith('.txt')]
        if txt_files:
            # If text files are found, download each one
            for file_dict in txt_files:
                file_name = file_dict['name']
                download(slug, files=file_name, destdir=meeting_dir, no_directory=True)
                print(f'Downloaded text file for {slug}: {file_name}')
        else:
            # Log if no text files are found for this item
            print(f'No text files found for {slug}')
        # Save the formatted metadata to a JSON file in the meeting directory
        save_metadata(meeting_dir, metadata)
    except Exception as e:
        # If any errors occur, log them
        logging.error(f"Error downloading meeting with slug: {slug}. Error: {e}")

# Function to save metadata as a JSON file
def save_metadata(directory, metadata):
    # Open a new JSON file in write mode within the specified directory
    with open(os.path.join(directory, 'metadata.json'), 'w', encoding='utf-8') as f:
        # Write the metadata dictionary to the file in a readable JSON format
        json.dump(metadata, f, ensure_ascii=False, indent=4)

# High-level function to orchestrate the downloading of all meetings
def download_and_print_all_meetings(query, output_directory):
    # Retrieve slugs from the search query
    slugs = get_slugs_from_query(query)
    for slug in slugs:
        try:
            # Download the meeting and its metadata for each slug
            download_meeting(slug, output_directory)
            # Log the successful download and processing
            print(f'Downloaded and processed meeting with slug: {slug}')
        except Exception as e:
            # Log any failures during the download process
            logging.error(f"Failed to download or process meeting with slug: {slug}. Error: {e}")

# Main entry point of the script
if __name__ == '__main__':
    # Set up logging with the level of INFO
    logging.basicConfig(level=logging.INFO)
    # Define the search subject and get the date range
    subject = "Cincinnati City Council"
    start_date, end_date = get_date_range(weeks=3)
    # Build the query using the subject and date range
    query = build_query(subject, start_date, end_date)
    # Define the directory where downloaded files will be stored
    output_directory = 'output/'
    # Log the start of the download process
    logging.info(f'Starting download for meetings from {start_date} to {end_date}')
    # Begin the download process
    download_and_print_all_meetings(query, output_directory)
We’re now able to specify a custom start and end date, as well as how many weeks’ worth of meetings we want to get. I’m going to stick with the latest three weeks of meetings, as I still have some work to do figuring out how best to present these and whether I want to add anything else, such as an audio clip reading the summary or some sort of generated image to accompany it.
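Since build_query() just takes whatever dates you hand it, it’s easy to pull a specific window instead of the rolling three weeks. Here’s a quick sketch of what that would look like — the dates are placeholders I made up, not real meeting dates:

# Hypothetical: pull a fixed window instead of the rolling three-week range.
subject = "Cincinnati City Council"
custom_query = build_query(subject, "2023-10-01", "2023-10-31")  # placeholder dates

# Reuse the same orchestration function with the custom query.
download_and_print_all_meetings(custom_query, 'output/')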
Here’s the code for the summarization, using the latest GPT-4 model from OpenAI (gpt-4-1106-preview):
import os
import openai
import json
from dotenv import load_dotenv

# Load the environment variables from the .env file
load_dotenv()

# Access the OpenAI API key
openai_api_key = os.getenv('OPENAI_API_KEY')

# Use the OpenAI API key (for example, by setting it in the OpenAI library configuration)
openai.api_key = openai_api_key

# Global variable for system message
SYSTEM_MESSAGE = {
    "role": "system",
    "content": """
    You are an expert journalist focusing on local politics, tasked with summarizing weekly Cincinnati City Council meetings in a manner that is both accurate and engaging for the general public. Your summary should:

    - Be approximately 500 words, reading like a news article with a balanced mix of straightforward reporting and narrative elements to captivate readers.
    - Highlight key facts, figures, and numbers, particularly those pertaining to council actions related to finances, budgets, and measures with immediate implications for residents.
    - Prioritize clarity and relevance, ensuring information is significant to the actions and decisions made by the council.
    - Distinguish between deliberation and decisive actions, focusing on binding outcomes.
    - Identify and analyze the top 5 most significant topics based on their potential impact on the community, scale of investment, or degree of controversy.
    - Discuss the top 5 most actionable items, characterized by opportunities for citizen involvement or critical points for public accountability.

    Write this in a manner that's easy to engage with and understand. It may be used as a transcript for a recording, but don't use words like "I" or "You" or anything casual.

    Format the summary with a clear introduction, body, and conclusion, including a separate bullet point list at the end for both sets of 5 items.

    Respond in JSON
    """
}

# Function to read the system message, currently redundant, but useful if reading from a file or database in the future.
def read_system_message():
    # This returns a dictionary object containing the role and content for the system message.
    return {
        "role": "system",
        "content": """
        [System message content]
        """
    }

# Function to summarize the content of a meeting text file.
def summarize_meeting(file_path, model="gpt-4-1106-preview"):
    # Open and read the content of the given file path.
    with open(file_path, 'r') as file:
        input_text = file.read()
    # User message to be sent to OpenAI, which contains the text to be summarized.
    user_message = {
        "role": "user",
        "content": input_text
    }
    # OpenAI API call to create a chat completion, which generates the summary based on the system and user messages.
    response = openai.ChatCompletion.create(
        model=model,
        messages=[SYSTEM_MESSAGE, user_message],
        seed=123456,
        response_format={
            "type": "json_object",
        }
    )
    # The response from the API call is returned.
    return response

# Function to save the summary into a new file.
def save_summary(file_path, summary):
    # Modify the file path to create a new file name for the summary.
    summary_file_path = file_path.replace('.txt', '_summary.json')
    # Open and write the summary to the file in JSON format, with indentation for readability.
    with open(summary_file_path, 'w') as file:
        json.dump(summary, file, ensure_ascii=False, indent=4)

# Function to summarize all meetings found within a given root directory.
def summarize_all_meetings(root_dir):
    # Walk through the directory structure, starting from `root_dir`.
    for dirpath, _, filenames in os.walk(root_dir):
        # Loop through each file in the current directory.
        for filename in filenames:
            # Check if the current file is a text file.
            if filename.endswith('.txt'):
                # Construct the full file path.
                file_path = os.path.join(dirpath, filename)
                # Generate a summary for the given text file.
                summary = summarize_meeting(file_path)
                # Save the summary to a new file.
                save_summary(file_path, summary)
    # Print out completion message when all files have been processed.
    print("Summarization completed for all files.")

# This conditional is Python's way to execute code only when the script is run directly, not when imported as a module.
if __name__ == '__main__':
    # Define the root directory where the text files are stored.
    root_dir = 'output/'
    # Begin the summarization process for all meeting text files in the root directory.
    summarize_all_meetings(root_dir)
I’m happy with what we have so far and excited to take it to the next level. Right now most of the information is being stored as JSON, and I want to start creating a database schema that will help me preserve and reuse as much of this as possible going forward. I’m going to work with ChatGPT to help me define and create one; a rough sketch of what I have in mind is below. After that, I’ll want to see if I can start using the database directly instead of referencing .txt files. We’ll then be able to start experimenting with some of OpenAI’s other new APIs, specifically text-to-speech and DALL·E 3 image generation.
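To get that conversation started, here’s a rough sketch of the kind of schema I’m imagining, written against SQLite since it’s the easiest thing to stand up locally. The table and column names are placeholders, not a final design:

import sqlite3

# Hypothetical schema sketch — names and fields are placeholders until the design is settled.
connection = sqlite3.connect('council_meetings.db')
cursor = connection.cursor()

# One row per Archive.org item, keyed by its identifier (slug).
cursor.execute("""
    CREATE TABLE IF NOT EXISTS meetings (
        slug TEXT PRIMARY KEY,
        title TEXT,
        public_date TEXT,
        metadata_json TEXT,
        transcript_path TEXT
    )
""")

# One row per generated summary, so re-running the summarizer preserves history.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS summaries (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        meeting_slug TEXT REFERENCES meetings(slug),
        model TEXT,
        summary_json TEXT,
        created_at TEXT DEFAULT CURRENT_TIMESTAMP
    )
""")

connection.commit()
connection.close()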