Moving toward production

Posted on: November 9, 2023 at 04:35 PM

After experimenting with different approaches in Jupyter notebooks and deciding on the best route to take, I’m ready to move my notebook code into something we can use for the final site.

I collaborated with ChatGPT to help me modularize and document the messy notebook code, turning it into a set of individual Python scripts.

Here’s where we ended up for the data retrieval script that will grab City Council closed captions from Archive.org:

import os
import json
import logging
from datetime import datetime, timedelta
from internetarchive import search_items, get_item, download

# Function to retrieve item slugs based on a search query
def get_slugs_from_query(query):
    # Perform a search on Archive.org using the provided query
    search_results = search_items(query)
    # Extract and return just the 'identifier' field from each search result
    slugs = [item['identifier'] for item in search_results]
    return slugs

# Function to get the start and end dates for a given range of weeks
def get_date_range(weeks=3):
    # Current date and time in UTC
    end_date = datetime.utcnow()
    # Calculate the start date by subtracting the number of weeks from the current date
    start_date = end_date - timedelta(weeks=weeks)
    # Return both dates in 'YYYY-MM-DD' format
    return start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')

# Function to construct a search query with a date range and subject filter
def build_query(subject, start_date, end_date):
    # Create and return a search query string using the Archive.org advanced search syntax
    return f'subject:"{subject}" AND date:[{start_date} TO {end_date}]'

# Function to convert a date string to ISO 8601 format
def convert_date_to_iso(date_str):
    # Parse the date string into a datetime object
    dt = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
    # Convert to ISO 8601 format and return
    return dt.isoformat() + 'Z'

# Function to update the date fields in the metadata to ISO format
def format_metadata(metadata):
    # List of metadata fields that contain dates
    for date_field in ['publicdate', 'addeddate']:
        # If the field is present in the metadata, update it to ISO format
        if date_field in metadata:
            metadata[date_field] = convert_date_to_iso(metadata[date_field])
    # Return the updated metadata
    return metadata

# Function to download meeting files and metadata
def download_meeting(slug, output_directory):
    print(f"download_meeting() called with slug: {slug}")  # Log which meeting is being processed

    try:
        # Retrieve item and its metadata from Archive.org
        item = get_item(slug)
        metadata = item.metadata
        # Format the date fields in metadata
        metadata = format_metadata(metadata)

        # Prepare a directory to save the downloaded files
        meeting_dir = os.path.join(output_directory, f'Council_Meetings_{slug}')
        # Create the directory if it doesn't exist
        os.makedirs(meeting_dir, exist_ok=True)

        # Look for text files in the item's files list
        txt_files = [f for f in item.files if f['name'].lower().endswith('.txt')]
        if txt_files:
            # If text files are found, download each one
            for file_dict in txt_files:
                file_name = file_dict['name']
                download(slug, files=file_name, destdir=meeting_dir, no_directory=True)
                print(f'Downloaded text file for {slug}: {file_name}')
        else:
            # Log if no text files are found for this item
            print(f'No text files found for {slug}')

        # Save the formatted metadata to a JSON file in the meeting directory
        save_metadata(meeting_dir, metadata)

    except Exception as e:
        # If any errors occur, log them
        logging.error(f"Error downloading meeting with slug: {slug}. Error: {e}")

# Function to save metadata as a JSON file
def save_metadata(directory, metadata):
    # Open a new JSON file in write mode within the specified directory
    with open(os.path.join(directory, 'metadata.json'), 'w', encoding='utf-8') as f:
        # Write the metadata dictionary to the file in a readable JSON format
        json.dump(metadata, f, ensure_ascii=False, indent=4)

# High-level function to orchestrate the downloading of all meetings
def download_and_print_all_meetings(query, output_directory):
    # Retrieve slugs from the search query
    slugs = get_slugs_from_query(query)
    for slug in slugs:
        try:
            # Download the meeting and its metadata for each slug
            download_meeting(slug, output_directory)
            # Log the successful download and processing
            print(f'Downloaded and processed meeting with slug: {slug}')
        except Exception as e:
            # Log any failures during the download process
            logging.error(f"Failed to download or process meeting with slug: {slug}. Error: {e}")

# Main entry point of the script
if __name__ == '__main__':
    # Set up logging with the level of INFO
    logging.basicConfig(level=logging.INFO)
    # Define the search subject and get the date range
    subject = "Cincinnati City Council"
    start_date, end_date = get_date_range(weeks=3)
    # Build the query using the subject and date range
    query = build_query(subject, start_date, end_date)
    # Define the directory where downloaded files will be stored
    output_directory = 'output/'
    # Log the start of the download process
    logging.info(f'Starting download for meetings from {start_date} to {end_date}')
    # Begin the download process
    download_and_print_all_meetings(query, output_directory)

We can now specify a custom start and end date when building the query, as well as how many weeks back the default window reaches. I’m going to stick with meetings from the last three weeks, since I still have more work to do figuring out how best to present these and whether I want to add any other information, such as an audio clip reading the summary or some sort of generated image to accompany it.
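
As a quick illustration (hypothetical dates, but the same functions as above), pulling a specific window instead of the rolling default looks like this:

# Hypothetical usage: an explicit date window instead of the get_date_range() default
query = build_query("Cincinnati City Council", "2023-10-01", "2023-10-31")
download_and_print_all_meetings(query, 'output/')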

Here’s the code for the summarization using the latest GPT-4 model from OpenAI (gpt-4-1106-preview, the new GPT-4 Turbo):

import os
import openai
import json
from dotenv import load_dotenv

# Load the environment variables from the .env file
load_dotenv()

# Access the OpenAI API key
openai_api_key = os.getenv('OPENAI_API_KEY')

# Use the OpenAI API key (for example, by setting it in the OpenAI library configuration)
openai.api_key = openai_api_key

# Global variable for system message
SYSTEM_MESSAGE = {
    "role": "system",
    "content": """
    You are an expert journalist focusing on local politics, tasked with summarizing weekly Cincinnati City Council meetings in a manner that is both accurate and engaging for the general public. Your summary should:
    - Be approximately 500 words, reading like a news article with a balanced mix of straightforward reporting and narrative elements to captivate readers.
    - Highlight key facts, figures, and numbers, particularly those pertaining to council actions related to finances, budgets, and measures with immediate implications for residents.
    - Prioritize clarity and relevance, ensuring information is significant to the actions and decisions made by the council.
    - Distinguish between deliberation and decisive actions, focusing on binding outcomes.
    - Identify and analyze the top 5 most significant topics based on their potential impact on the community, scale of investment, or degree of controversy.
    - Discuss the top 5 most actionable items, characterized by opportunities for citizen involvement or critical points for public accountability.
    Write this in a manner that's easy to engage with and understand. It may be used as a transcript for a recording, but don't use words like "I" or "You" or anything casual.
    Format the summary with a clear introduction, body, and conclusion, including a separate bullet point list at the end for both sets of 5 items.
    Respond in JSON
    """
}

# Function to read the system message, currently redundant, but useful if reading from a file or database in the future.
def read_system_message():
    # This returns a dictionary object containing the role and content for the system message.
    return {
        "role": "system",
        "content": """
        [System message content]
        """
    }

# Function to summarize the content of a meeting text file.
def summarize_meeting(file_path, model="gpt-4-1106-preview"):
    # Open and read the content of the given file path.
    with open(file_path, 'r') as file:
        input_text = file.read()

    # User message to be sent to OpenAI, which contains the text to be summarized.
    user_message = {
        "role": "user",
        "content": input_text
    }

    # OpenAI API call to create a chat completion (this uses the pre-1.0 openai SDK interface), which generates the summary based on the system and user messages.
    response = openai.ChatCompletion.create(
        model=model,
        messages=[SYSTEM_MESSAGE, user_message],
        seed=123456,
        response_format={
            "type": "json_object",
        }
    )
    # The response from the API call is returned.
    return response

# Function to save the summary into a new file.
def save_summary(file_path, summary):
    # Modify the file path to create a new file name for the summary.
    summary_file_path = file_path.replace('.txt', '_summary.json')
    # Write the full API response (not just the summary text) to the file as JSON, with indentation for readability.
    with open(summary_file_path, 'w') as file:
        json.dump(summary, file, ensure_ascii=False, indent=4)

# Function to summarize all meetings found within a given root directory.
def summarize_all_meetings(root_dir):
    # Walk through the directory structure, starting from `root_dir`.
    for dirpath, _, filenames in os.walk(root_dir):
        # Loop through each file in the current directory.
        for filename in filenames:
            # Check if the current file is a text file.
            if filename.endswith('.txt'):
                # Construct the full file path.
                file_path = os.path.join(dirpath, filename)
                # Generate a summary for the given text file.
                summary = summarize_meeting(file_path)
                # Save the summary to a new file.
                save_summary(file_path, summary)
    # Print out completion message when all files have been processed.
    print("Summarization completed for all files.")

# This conditional is Python's way to execute code only when the script is run directly, not when imported as a module.
if __name__ == '__main__':
    # Define the root directory where the text files are stored.
    root_dir = 'output/'
    # Begin the summarization process for all meeting text files in the root directory.
    summarize_all_meetings(root_dir)
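
One thing to keep in mind: save_summary writes the entire API response to disk, not just the model’s text, so downstream code will need to dig the summary out of that structure. A minimal sketch, assuming the pre-1.0 response shape and a placeholder file path:

import json

# Hypothetical path -- summary files are written next to each meeting's .txt file
with open('output/Council_Meetings_example/captions_summary.json', 'r', encoding='utf-8') as f:
    response = json.load(f)

# In the pre-1.0 response shape, the generated text lives at choices[0].message.content;
# since we requested a JSON response, that content is itself a JSON string
summary = json.loads(response['choices'][0]['message']['content'])
print(summary)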

I’m happy with what we have so far and excited to take it to the next level. Right now most of the information is being stored as JSON, and I want to create a database schema that will help me preserve and reuse as much of this as possible going forward. I’m going to work with ChatGPT to help me define and create one. After that, I’ll see if I can use the database directly instead of referencing txt files. We’ll then be able to start experimenting with some of the new APIs OpenAI just announced, specifically text-to-speech and DALL·E 3 image generation.
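
As a starting point for that conversation with ChatGPT, here’s a rough sketch of the kind of schema I have in mind. This is a minimal sketch using SQLite, and all the table and column names are placeholders, not a final design:

import sqlite3

# Hypothetical first-pass schema -- names and types are placeholders
conn = sqlite3.connect('council.db')
conn.executescript("""
CREATE TABLE IF NOT EXISTS meetings (
    slug TEXT PRIMARY KEY,          -- Archive.org identifier
    title TEXT,
    public_date TEXT,               -- ISO 8601 date from the item metadata
    metadata_json TEXT              -- full metadata blob, preserved as-is
);

CREATE TABLE IF NOT EXISTS summaries (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    meeting_slug TEXT REFERENCES meetings(slug),
    model TEXT,                     -- e.g. gpt-4-1106-preview
    summary_json TEXT               -- raw JSON returned by the model
);
""")
conn.commit()
conn.close()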