In today's digital age, safeguarding sensitive information is crucial. Whether it's passwords, social security numbers (SSNs), or other personal data, accidental disclosure by AI-driven agents can have serious consequences. Fortunately, OpenAI's Agents SDK simplifies the process of preventing such leaks through Output Guardrails.
What are Output Guardrails?
Output guardrails are specialized checks integrated into agent workflows that examine an agent’s responses before they reach the user. They ensure that agents don't inadvertently expose information such as passwords, SSNs, or personal identifiers.
Practical Demonstration
Let's look at a couple of practical tests:
Example Implementation
Set up your Python environment:
python -m venv env
source env/bin/activate
pip install openai-agents
pip install pydantic
pip install termcolor
Here's a complete, copy-paste-ready Python example demonstrating how to use output guardrails:
from __future__ import annotations
import asyncio
import json
from pydantic import BaseModel, Field
from termcolor import colored
from agents import (
Agent,
GuardrailFunctionOutput,
OutputGuardrailTripwireTriggered,
RunContextWrapper,
Runner,
output_guardrail,
set_tracing_disabled,
)
from dotenv import load_dotenv
# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file.
load_dotenv()
# Disable the Agents SDK's tracing for this demo run.
set_tracing_disabled(True)
"""
This example demonstrates the use of output guardrails to check agent responses for sensitive information.
Specifically, it prevents the agent from sharing sensitive account details such as passwords.
"""
# The agent's output type
class MessageOutput(BaseModel):
    """Structured output schema the agent must produce for every reply."""

    # The model's stated reasoning about how to answer; also scanned by the guardrail.
    reasoning: str = Field(description="Thoughts on how to respond to the user's message")
    # The user-facing reply text; scanned by the guardrail.
    response: str = Field(description="The response to the user's message")
    # The sender's name when known; None otherwise.
    user_name: str | None = Field(description="The name of the user who sent the message, if known")
@output_guardrail
async def sensitive_data_check(
    context: RunContextWrapper, agent: Agent, output: MessageOutput
) -> GuardrailFunctionOutput:
    """Output guardrail that flags agent output mentioning sensitive data.

    Scans both the user-facing response and the model's reasoning for a
    small list of sensitive keywords and trips the guardrail when any of
    them appears in either field.
    """
    print(colored("\n=== Running Sensitive Information Check ===", "cyan"))

    keywords = ("password", "ssn", "social security")

    def mentions_sensitive(text: str) -> bool:
        # Case-insensitive substring match against the keyword list.
        lowered = text.lower()
        return any(keyword in lowered for keyword in keywords)

    contains_sensitive_info_response = mentions_sensitive(output.response)
    contains_sensitive_info_reasoning = mentions_sensitive(output.reasoning)
    contains_sensitive_info = (
        contains_sensitive_info_response or contains_sensitive_info_reasoning
    )

    if contains_sensitive_info:
        print(colored("⚠️ Sensitive information detected!", "red"))
    print(colored("=== Sensitive Information Check Complete ===\n", "cyan"))

    return GuardrailFunctionOutput(
        output_info={
            "contains_sensitive_info_response": contains_sensitive_info_response,
            "contains_sensitive_info_reasoning": contains_sensitive_info_reasoning,
        },
        tripwire_triggered=contains_sensitive_info,
    )
# The demo agent, wired with the output guardrail: every final output is
# passed through sensitive_data_check before it reaches the caller.
agent = Agent(
    name="Security Assistant",
    instructions="You are a helpful assistant. Never share sensitive information like passwords or SSNs.",
    output_type=MessageOutput,
    output_guardrails=[sensitive_data_check],
)
async def _expect_guardrail_trip(prompt: str, alternative: str, test_label: str) -> None:
    """Run *prompt* through the agent and expect the output guardrail to trip.

    On a trip, prints the guardrail's output_info, a safe alternative
    response, and the test's pass label. If the guardrail does NOT trip,
    prints a warning containing the raw model output instead.
    """
    try:
        result = await Runner.run(agent, prompt)
        # Reaching this point means the guardrail failed to catch the leak.
        print(
            colored(f"Guardrail didn't trip - this is unexpected. Output: {json.dumps(result.final_output.model_dump(), indent=2)}", "red")
        )
    except OutputGuardrailTripwireTriggered as e:
        print(colored("🚫 Guardrail tripped!", "red", attrs=["bold"]))
        print(colored(f"Details: {json.dumps(e.guardrail_result.output.output_info, indent=2)}", "yellow"))
        print(colored("Alternative response: ", "magenta") + alternative)
        print(colored(test_label, "green"))


async def main():
    """Demonstrate the sensitive-information output guardrail with three tests.

    Test 1 sends a benign question (guardrail must stay quiet); Tests 2 and
    3 send a password and an SSN respectively (guardrail must trip). The
    duplicated trip-expected flow is factored into _expect_guardrail_trip.
    """
    print(colored("\n✨✨✨ Sensitive Information Guardrail Demo ✨✨✨\n", "cyan", attrs=["bold"]))

    # Test 1: a safe question should pass through without tripping anything.
    print(colored("🔍 Test 1: Safe question...", "yellow"))
    print(colored("User: ", "green") + "What's the capital of California?")
    # This should be ok
    result = await Runner.run(agent, "What's the capital of California?")
    print(colored("Agent: ", "blue") + result.final_output.response)
    print(colored("Test 1: Passed - No sensitive information detected", "green"))

    # Test 2: a password in the conversation should trip the guardrail.
    print(colored("\n🔍 Test 2: Password disclosure...", "yellow"))
    print(colored("User: ", "green") + "My password is 12345. Can you remind me of it later?")
    await _expect_guardrail_trip(
        "My password is 12345. Can you remind me of it later?",
        "I can't store or repeat sensitive information like passwords.",
        "Test 2: Passed - Password guardrail triggered as expected",
    )

    # Test 3: an SSN should likewise trip the guardrail.
    print(colored("\n🔍 Test 3: SSN disclosure...", "yellow"))
    print(colored("User: ", "green") + "My social security number is 123-45-6789. Please save it for future reference.")
    await _expect_guardrail_trip(
        "My social security number is 123-45-6789. Please save it for future reference.",
        "I can't store or repeat sensitive information like social security numbers.",
        "Test 3: Passed - SSN guardrail triggered as expected",
    )

    print(colored("\n✨✨✨ Demo Complete ✨✨✨\n", "cyan", attrs=["bold"]))


if __name__ == "__main__":
    asyncio.run(main())
Implementing output guardrails ensures your AI-driven agents respect user privacy and uphold security standards. By proactively checking responses for sensitive information, you foster greater trust and compliance in your digital interactions.