Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/bibliography.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@ All academic papers, research blogs, and technical reports referenced throughout
:::{dropdown} Citation Keys
:class: hidden-citations

[@aakanksha2024multilingual; @adversaai2023universal; @andriushchenko2024tense; @anthropic2024manyshot; @aqrawi2024singleturncrescendo; @bethany2024mathprompt; @bhardwaj2023harmfulqa; @bryan2025agentictaxonomy; @bullwinkel2025airtlessons; @bullwinkel2025repeng; @bullwinkel2026trigger; @chao2023pair; @chao2024jailbreakbench; @cui2024orbench; @darkbench2025; @derczynski2024garak; @ding2023wolf; @embracethered2024unicode; @embracethered2025sneakybits; @ghosh2025aegis; @haider2024phi3safety; @han2024medsafetybench; @hines2024spotlighting; @ji2023beavertails; @ji2024pkusaferlhf; @jiang2025sosbench; @jones2025computeruse; @kingma2014adam; @li2024saladbench; @li2024wmdp; @lin2023toxicchat; @liu2024flipattack; @lopez2024pyrit; @lv2024codechameleon; @mazeika2023tdc; @mazeika2024harmbench; @mckee2024transparency; @mehrotra2023tap; @microsoft2024skeletonkey; @palaskar2025vlsu; @pfohl2024equitymedqa; @promptfoo2025ccp; @robustintelligence2024bypass; @roccia2024promptintel; @rottger2023xstest; @russinovich2024crescendo; @russinovich2025price; @scheuerman2025transphobia; @shayegani2025computeruse; @shen2023donotanything; @sheshadri2024lat; @stok2023ansi; @tan2026comicjailbreak; @tang2025multilingual; @tedeschi2024alert; @vantaylor2024socialbias; @vidgen2023simplesafetytests; @vidgen2024ailuminate; @wang2023decodingtrust; @wang2023donotanswer; @wei2023jailbroken; @xie2024sorrybench; @yu2023gptfuzzer; @yuan2023cipherchat; @zeng2024persuasion; @zhang2024cbtbench; @zou2023gcg]
[@aakanksha2024multilingual; @adversaai2023universal; @andriushchenko2024tense; @anthropic2024manyshot; @aqrawi2024singleturncrescendo; @bethany2024mathprompt; @bhardwaj2023harmfulqa; @bryan2025agentictaxonomy; @bullwinkel2025airtlessons; @bullwinkel2025repeng; @bullwinkel2026trigger; @chao2023pair; @chao2024jailbreakbench; @cui2024orbench; @darkbench2025; @derczynski2024garak; @ding2023wolf; @embracethered2024unicode; @embracethered2025sneakybits; @ghosh2025aegis; @gupta2024walledeval; @haider2024phi3safety; @han2024medsafetybench; @hines2024spotlighting; @ji2023beavertails; @ji2024pkusaferlhf; @jiang2025sosbench; @jones2025computeruse; @kingma2014adam; @li2024saladbench; @li2024wmdp; @lin2023toxicchat; @liu2024flipattack; @lopez2024pyrit; @lv2024codechameleon; @mazeika2023tdc; @mazeika2024harmbench; @mckee2024transparency; @mehrotra2023tap; @microsoft2024skeletonkey; @palaskar2025vlsu; @pfohl2024equitymedqa; @promptfoo2025ccp; @robustintelligence2024bypass; @roccia2024promptintel; @rottger2023xstest; @russinovich2024crescendo; @russinovich2025price; @scheuerman2025transphobia; @shayegani2025computeruse; @shen2023donotanything; @sheshadri2024lat; @stok2023ansi; @tan2026comicjailbreak; @tang2025multilingual; @tedeschi2024alert; @vantaylor2024socialbias; @vidgen2023simplesafetytests; @vidgen2024ailuminate; @wang2023decodingtrust; @wang2023donotanswer; @wei2023jailbroken; @xie2024sorrybench; @yu2023gptfuzzer; @yuan2023cipherchat; @zeng2024persuasion; @zhang2024cbtbench; @zou2023gcg]

:::
8 changes: 8 additions & 0 deletions doc/references.bib
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,14 @@ @article{bhardwaj2023harmfulqa
note = {Introduces the {HarmfulQA} dataset},
}

@article{gupta2024walledeval,
title = {{WalledEval}: A Comprehensive Safety Evaluation Toolkit for Large Language Models},
author = {Prannaya Gupta and Le Qi Yau and Hao Han Low and I-Shiang Lee and Hugo Maximus Lim and Yu Xin Teoh and Jia Hng Koh and Dar Win Liew and Rishabh Bhardwaj and Rajat Bhardwaj and Soujanya Poria},
journal = {arXiv preprint arXiv:2408.03837},
year = {2024},
url = {https://arxiv.org/abs/2408.03837},
}

@article{palaskar2025vlsu,
title = {{VLSU}: Mapping the Limits of Joint Multimodal Understanding for {AI} Safety},
author = {Shruti Palaskar and Leon Gatys and Mona Abdelrahman and Mar Jacobo and Larry Lindsey and Rutika Moharir and Gunnar Lund and Yang Xu and Navid Shiee and Jeffrey Bigham and Charles Maalouf and Joseph Yitan Cheng},
Expand Down
6 changes: 6 additions & 0 deletions pyrit/datasets/seed_datasets/remote/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@
from pyrit.datasets.seed_datasets.remote.harmful_qa_dataset import (
_HarmfulQADataset,
) # noqa: F401
from pyrit.datasets.seed_datasets.remote.hixstest_dataset import (
HiXSTestLanguage,
_HiXSTestDataset,
) # noqa: F401
from pyrit.datasets.seed_datasets.remote.jbb_behaviors_dataset import (
_JBBBehaviorsDataset,
) # noqa: F401
Expand Down Expand Up @@ -125,6 +129,7 @@
) # noqa: F401

__all__ = [
"HiXSTestLanguage",
"PromptIntelCategory",
"PromptIntelSeverity",
"VLGuardCategory",
Expand All @@ -145,6 +150,7 @@
"_HarmBenchDataset",
"_HarmBenchMultimodalDataset",
"_HarmfulQADataset",
"_HiXSTestDataset",
"_JBBBehaviorsDataset",
"_LibrAIDoNotAnswerDataset",
"_LLMLatentAdversarialTrainingDataset",
Expand Down
189 changes: 189 additions & 0 deletions pyrit/datasets/seed_datasets/remote/hixstest_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging
import os
from enum import Enum

from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
_RemoteDatasetLoader,
)
from pyrit.models import SeedDataset, SeedPrompt

logger = logging.getLogger(__name__)


class HiXSTestLanguage(Enum):
    """
    Selects which text of a HiXSTest row becomes the ``value`` of its SeedPrompt.

    Members:
        HINDI: The original Hindi prompt, i.e. the evaluation the dataset was built for.
        ENGLISH: The English translation shipped with the dataset; handy for checking
            the English semantics of a prompt or for feeding English-only pipelines.
    """

    # Member values are the short language codes recorded alongside each prompt.
    HINDI = "hi"
    ENGLISH = "en"


class _HiXSTestDataset(_RemoteDatasetLoader):
    """
    Remote loader for HiXSTest, the Hindi Exaggerated-Safety Test set on HuggingFace.

    HiXSTest is a manually-curated collection of 50 exaggerated-safety prompts written
    in Hindi, each paired with an English translation; it is the companion to SGXSTest.
    The set probes for exaggerated-safety behavior, meaning a model refuses a benign
    prompt whose harmful reading is not warranted in a Hindi cultural context.

    Fields of each row:
        - prompt: the prompt text in Hindi
        - english_prompt: the English translation of the prompt
        - label: "safe" or "unsafe"
        - category: the polysemous Hindi trigger word under test (e.g. "मारना")

    The Hindi text is the ``SeedPrompt.value`` unless the loader is constructed with
    ``language=HiXSTestLanguage.ENGLISH``, in which case the translation is used.
    Whichever is chosen, both texts are kept in ``metadata`` under ``hindi_prompt``
    and ``english_prompt``.

    Note: The dataset is gated on HuggingFace. Accept the terms at
    https://huggingface.co/datasets/walledai/HiXSTest first, then supply a HuggingFace
    token through the ``token`` parameter or the ``HUGGINGFACE_TOKEN`` environment
    variable.

    References:
        - https://huggingface.co/datasets/walledai/HiXSTest
        - [@gupta2024walledeval]
    License: Apache-2.0
    """

    HF_DATASET_NAME: str = "walledai/HiXSTest"

    # Declared on the class so SeedDatasetMetadata discovery can read them.
    modalities: list[str] = ["text"]
    size: str = "small"  # the dataset holds 50 seeds
    tags: set[str] = {"default", "safety", "multilingual"}

    def __init__(
        self,
        *,
        language: HiXSTestLanguage = HiXSTestLanguage.HINDI,
        split: str = "train",
        token: str | None = None,
    ) -> None:
        """
        Configure the loader; nothing is downloaded until the dataset is fetched.

        Args:
            language: Language whose text becomes ``SeedPrompt.value``. The default,
                ``HiXSTestLanguage.HINDI``, is the language the dataset targets;
                ``HiXSTestLanguage.ENGLISH`` switches to the English translation.
            split: Name of the dataset split to load. Defaults to "train", which is
                the only split.
            token: Hugging Face authentication token. When omitted, the value of the
                ``HUGGINGFACE_TOKEN`` environment variable is used instead.

        Raises:
            ValueError: If ``language`` is not a ``HiXSTestLanguage`` instance.
        """
        self._validate_enum(language, HiXSTestLanguage, "language")

        # An explicitly passed token wins; the environment is only a fallback.
        if token is None:
            token = os.environ.get("HUGGINGFACE_TOKEN")

        self.language = language
        self.split = split
        self.token = token

    @property
    def dataset_name(self) -> str:
        """Name under which this dataset's seeds are registered."""
        return "hixstest"

    async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
        """
        Download HiXSTest from HuggingFace and wrap every row in a SeedPrompt.

        Args:
            cache: Whether the fetched dataset should be cached. Defaults to True.

        Returns:
            SeedDataset: The HiXSTest prompts. Each ``SeedPrompt.value`` holds the Hindi
                text, or the English translation when the loader was built with
                ``language=HiXSTestLanguage.ENGLISH``. ``metadata`` always carries
                ``hindi_prompt`` and ``english_prompt`` together with ``label``,
                ``category`` and the selected ``language`` code.
        """
        logger.info(f"Loading HiXSTest dataset from {self.HF_DATASET_NAME} (language={self.language.value})")

        rows = await self._fetch_from_huggingface(
            dataset_name=self.HF_DATASET_NAME,
            split=self.split,
            cache=cache,
            token=self.token,
        )

        # Provenance details are identical for every row, so build them once up front.
        source_url = f"https://huggingface.co/datasets/{self.HF_DATASET_NAME}"
        groups = ["Walled AI", "DeCLaRe Lab, Singapore University of Technology and Design"]
        description = (
            "HiXSTest contains 50 manually-curated exaggerated-safety prompts in Hindi "
            "(with English translations), companion to SGXSTest. It tests whether language "
            "models exhibit exaggerated-safety behavior in a Hindi cultural context. "
            "Introduced in 'WalledEval: A Comprehensive Safety Evaluation Toolkit for "
            "Large Language Models' (2024)."
        )
        authors = [
            "Prannaya Gupta",
            "Le Qi Yau",
            "Hao Han Low",
            "I-Shiang Lee",
            "Hugo Maximus Lim",
            "Yu Xin Teoh",
            "Jia Hng Koh",
            "Dar Win Liew",
            "Rishabh Bhardwaj",
            "Rajat Bhardwaj",
            "Soujanya Poria",
        ]

        seed_prompts = []
        for row in rows:
            # Resolve the prompt text first so a row lacking it fails before anything is built.
            prompt_text = self._select_value(row)
            trigger_word = row.get("category")
            row_metadata = {
                "hindi_prompt": row.get("prompt", ""),
                "english_prompt": row.get("english_prompt", ""),
                "label": row.get("label", ""),
                "category": row.get("category", ""),
                "language": self.language.value,
            }
            seed_prompts.append(
                SeedPrompt(
                    value=prompt_text,
                    data_type="text",
                    dataset_name=self.dataset_name,
                    # The row's category doubles as the harm category; rows without one get none.
                    harm_categories=[trigger_word] if trigger_word else [],
                    description=description,
                    source=source_url,
                    authors=authors,
                    groups=groups,
                    metadata=row_metadata,
                )
            )

        logger.info(f"Successfully loaded {len(seed_prompts)} prompts from HiXSTest dataset")

        return SeedDataset(seeds=seed_prompts, dataset_name=self.dataset_name)

    def _select_value(self, item: dict) -> str:
        """
        Pick the text of ``item`` that matches ``self.language``.

        Args:
            item (dict): One row of the HiXSTest dataset.

        Returns:
            str: The row's ``english_prompt`` when the loader is set to English,
                otherwise its ``prompt`` (the Hindi text).

        Raises:
            ValueError: If the field for the selected language is absent or empty.
        """
        if self.language is HiXSTestLanguage.ENGLISH:
            field = "english_prompt"
        else:
            field = "prompt"

        text = item.get(field)
        if text:
            return text

        # A missing key and an empty value are rejected alike.
        raise ValueError(
            f"HiXSTest row is missing required field '{field}' for language={self.language.value}: {item!r}"
        )
Loading
Loading