awesome-deepseek-integration/deepseek_content_moderation/tests/test_moderator.py
feat: Add new content moderation project
This commit introduces a new project, `deepseek_content_moderation`, designed to detect sensitive content in text based on configurable keyword lists.

Key features include:
- Customizable categories of sensitive words stored in `config.json`.
- A `Moderator` class (`moderator.py`) that loads the configuration and compiles regexes for case-insensitive, whole-word matching (a sketch appears after this summary).
- The `analyze_text` method returns a dictionary of triggered categories and the specific words found.
- Unit tests (`tests/test_moderator.py`), written with pytest, that verify the behaviour of the `Moderator` class.
- A detailed `README.md` provides an overview, setup instructions, usage examples, and testing guidelines.

The project structure has been set up to be a valid Python package, with the main directory named `deepseek_content_moderation`.
This project serves as a foundational component for applications requiring basic content filtering capabilities.
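
`moderator.py` itself is not shown on this page, so the following is a minimal sketch of a `Moderator` consistent with the tests below. The `config_path` parameter, the `config` and `category_regexes` attributes, and the shape of `analyze_text`'s return value are inferred from the test assertions rather than taken from the actual source:

    # moderator.py (sketch inferred from the tests; the real file may differ)
    import json
    import re


    class Moderator:
        def __init__(self, config_path="config.json"):
            # Load {"Category": ["keyword", ...]} lists from the JSON config.
            with open(config_path) as f:
                self.config = json.load(f)
            # One case-insensitive, whole-word regex per category, e.g.
            # "Profanity" -> r"\b(badword|swear)\b".
            self.category_regexes = {
                category: re.compile(
                    r"\b(" + "|".join(map(re.escape, words)) + r")\b",
                    re.IGNORECASE,
                )
                for category, words in self.config.items()
            }

        def analyze_text(self, text):
            # Map each triggered category to the unique matches as they
            # appear in the text; categories with no hits are omitted.
            results = {}
            for category, regex in self.category_regexes.items():
                found = list(dict.fromkeys(regex.findall(text)))
                if found:
                    results[category] = found
            return results

    # Example: Moderator("config.json").analyze_text("a badword here")
    #          -> {"Profanity": ["badword"]}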

import json

import pytest

from deepseek_content_moderation.moderator import Moderator  # Adjusted import path


# Helper to create a temporary config file for testing
@pytest.fixture
def temp_config_file(tmp_path):
    config_data = {
        "Profanity": ["badword", "swear"],
        "HateSpeech": ["hateful_term", "slur"],
        "SpecificCategory": ["unique_term_for_test"]
    }
    config_file = tmp_path / "test_config.json"
    with open(config_file, 'w') as f:
        json.dump(config_data, f)
    return str(config_file)  # Return path as string


@pytest.fixture
def moderator_instance(temp_config_file):
    # Ensure the moderator uses the temp config by passing the path
    return Moderator(config_path=temp_config_file)


def test_config_loading(moderator_instance):
    assert "Profanity" in moderator_instance.config
    assert "swear" in moderator_instance.config["Profanity"]
    assert "HateSpeech" in moderator_instance.category_regexes
    assert moderator_instance.category_regexes["Profanity"].pattern == r"\b(badword|swear)\b"


def test_analyze_text_no_sensitivities(moderator_instance):
    analysis = moderator_instance.analyze_text("This is a clean sentence.")
    assert analysis == {}


def test_analyze_text_single_category_single_word(moderator_instance):
    analysis = moderator_instance.analyze_text("This sentence contains a badword.")
    assert "Profanity" in analysis
    assert analysis["Profanity"] == ["badword"]


def test_analyze_text_single_category_multiple_words(moderator_instance):
    analysis = moderator_instance.analyze_text("This sentence has badword and also swear.")
    assert "Profanity" in analysis
    assert sorted(analysis["Profanity"]) == sorted(["badword", "swear"])


def test_analyze_text_multiple_categories(moderator_instance):
    analysis = moderator_instance.analyze_text("A sentence with badword and a hateful_term.")
    assert "Profanity" in analysis
    assert analysis["Profanity"] == ["badword"]
    assert "HateSpeech" in analysis
    assert analysis["HateSpeech"] == ["hateful_term"]


def test_analyze_text_case_insensitivity(moderator_instance):
    analysis = moderator_instance.analyze_text("This has a BADWORD and HATEFUL_TERM.")
    assert "Profanity" in analysis
    assert analysis["Profanity"] == ["BADWORD"]  # The regex returns the found casing
    assert "HateSpeech" in analysis
    assert analysis["HateSpeech"] == ["HATEFUL_TERM"]


def test_analyze_text_empty_string(moderator_instance):
    analysis = moderator_instance.analyze_text("")
    assert analysis == {}


def test_analyze_text_words_within_words_whole_word_matching(moderator_instance):
    # 'swear' is a keyword; 'swearinger' merely contains it, so only the
    # standalone 'swear' should match.
    analysis = moderator_instance.analyze_text("He is swearinger but not swear.")
    assert "Profanity" in analysis
    assert analysis["Profanity"] == ["swear"]
    # A whole-word occurrence still matches even when surrounded by punctuation.
    analysis_substring = moderator_instance.analyze_text("This is just a test, not a hateful_term at all.")
    assert "HateSpeech" in analysis_substring
    assert analysis_substring["HateSpeech"] == ["hateful_term"]
    # 'unique_term_for_testing' contains the keyword 'unique_term_for_test' only
    # as a substring, so whole-word matching must not trigger.
    analysis_no_match = moderator_instance.analyze_text("This sentence has a term but not the specific unique_term_for_testing.")
    assert "SpecificCategory" not in analysis_no_match


def test_analyze_text_repeated_words(moderator_instance):
    analysis = moderator_instance.analyze_text("This badword is a badword again badword.")
    assert "Profanity" in analysis
    assert analysis["Profanity"] == ["badword"]  # Should only list unique matches


def test_analyze_text_with_punctuation(moderator_instance):
    analysis = moderator_instance.analyze_text("Is this a badword? Yes, badword!")
    assert "Profanity" in analysis
    assert analysis["Profanity"] == ["badword"]
    analysis_slur = moderator_instance.analyze_text("No slur, okay?")
    assert "HateSpeech" in analysis_slur
    assert analysis_slur["HateSpeech"] == ["slur"]


# Expected project layout (the package directory uses underscores so it is
# importable as a Python package):
#
#   deepseek_content_moderation/
#       __init__.py
#       moderator.py
#       config.json
#       tests/
#           test_moderator.py
#
# To run these tests, install pytest (`pip install pytest`) and run
# `python -m pytest deepseek_content_moderation/tests` from the directory that
# contains `deepseek_content_moderation`. Running via `python -m` puts the
# current directory on sys.path, so the absolute import at the top of this file
# (`from deepseek_content_moderation.moderator import Moderator`) resolves;
# this requires an `__init__.py` in the `deepseek_content_moderation` directory.
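#
# Alternatively, a conftest.py at the repository root can make the import work
# from any working directory. The following is a hypothetical sketch, not a
# file that ships with this commit; it simply prepends the directory containing
# `deepseek_content_moderation` to sys.path before pytest collects the tests:
#
#     # conftest.py (hypothetical, placed next to deepseek_content_moderation/)
#     import os
#     import sys
#     sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))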