diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8342a80b84..e422b3d987 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -69,16 +69,6 @@ repos: name: Ruff (Jupyter Notebooks) args: [--fix] - - repo: local - hooks: - - id: check-links - name: Check Links in Python and md Files - entry: python ./build_scripts/check_links.py - language: python - files: ^doc.*\.(py|md)$ - additional_dependencies: ['requests'] - exclude: (release_process.md|git.md|^doc/deployment/|tests|pyrit/prompt_converter/morse_converter.py|.github|pyrit/prompt_converter/emoji_converter.py|pyrit/score/markdown_injection.py|^pyrit/datasets/|^pyrit/auxiliary_attacks/gcg/) - - repo: https://github.com/allganize/ty-pre-commit rev: v0.0.32 hooks: diff --git a/Makefile b/Makefile index d1de710acd..e42722b191 100644 --- a/Makefile +++ b/Makefile @@ -25,7 +25,8 @@ ty: docs-build: uv run python build_scripts/pydoc2json.py pyrit --submodules -o doc/_api/pyrit_all.json uv run python build_scripts/gen_api_md.py - cd doc && uv run jupyter-book build --all --html + # --strict validates URLs and cross-refs; skips are configured in doc/myst.yml under error_rules + cd doc && uv run jupyter-book build --all --html --strict uv run ./build_scripts/generate_rss.py # Build the full documentation site including the PDF export. @@ -36,7 +37,8 @@ docs-build: docs-build-all: uv run python build_scripts/pydoc2json.py pyrit --submodules -o doc/_api/pyrit_all.json uv run python build_scripts/gen_api_md.py - cd doc && uv run jupyter-book build --all --html --pdf + # --strict validates URLs and cross-refs; skips are configured in doc/myst.yml under error_rules + cd doc && uv run jupyter-book build --all --html --pdf --strict uv run ./build_scripts/generate_rss.py # Regenerate only the API reference pages (without building the full site) diff --git a/build_scripts/check_links.py b/build_scripts/check_links.py deleted file mode 100644 index 827a47c2a3..0000000000 --- a/build_scripts/check_links.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import os -import re -import sys -import time -from concurrent.futures import ThreadPoolExecutor, as_completed -from urllib.parse import urlsplit, urlunsplit - -import requests - -skipped_urls = [ - "https://cognitiveservices.azure.com/.default", - "https://gandalf.lakera.ai/api/send-message", - "https://code.visualstudio.com/Download", # This will block python requests - "https://platform.openai.com/docs/api-reference/introduction", # blocks python requests - "https://platform.openai.com/docs/api-reference/responses", # blocks python requests - "https://platform.openai.com/docs/guides/function-calling", # blocks python requests - "https://platform.openai.com/docs/guides/structured-outputs", # blocks python requests - "https://platform.openai.com/api-keys", # blocks python requests (requires auth) - "https://www.anthropic.com/research/many-shot-jailbreaking", # blocks python requests - "https://doi.org/10.1145/3749447", # ACM blocks automated requests - "https://azure.microsoft.com/free/", # Azure blocks automated requests - "https://code.visualstudio.com/docs/devcontainers/containers", - "https://stackoverflow.com/questions/77134272/pip-install-dev-with-pyproject-toml-not-working", - "https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers", -] - -custom_myst_references = ["notebook_tests", "mistralai_mixtral_8x7b_instruct_v0_1"] - -# Updated regex pattern to capture URLs from Markdown and HTML -URL_PATTERN = re.compile(r'\[.*?\]\((.*?)\)|href="([^"]+)"|src="([^"]+)"') - -# Pattern to capture :link: directives from MyST grid-item-cards -GRID_LINK_PATTERN = re.compile(r"^:link:\s+(.+)$", re.MULTILINE) - - -def extract_urls(file_path): - with open(file_path, encoding="utf-8") as file: - content = file.read() - matches = URL_PATTERN.findall(content) - # Flatten the list of tuples and filter out empty strings - urls = [strip_fragment(url) for match in matches for url in match if url] - - # Extract :link: directives from MyST grid-item-cards - grid_links = GRID_LINK_PATTERN.findall(content) - urls.extend(grid_links) - - return urls - - -def strip_fragment(url): - """ - Removes the fragment (#...) from the URL, so the base URL can be checked. - """ - parsed_url = urlsplit(url) - return urlunsplit((parsed_url.scheme, parsed_url.netloc, parsed_url.path, parsed_url.query, "")) - - -def resolve_relative_url(base_path, url): - if not url.startswith(("http://", "https://", "mailto:", "attachment:")): - # Handle MyST doc references (e.g., setup/1b_install_docker) - # These can be .md, .rst, or directory paths - abs_path = os.path.abspath(os.path.join(os.path.dirname(base_path), url)) - - # Check various possible file extensions for doc links - if not os.path.exists(abs_path): - for ext in [".md", ".ipynb"]: - if os.path.exists(abs_path + ext): - return abs_path + ext - - return abs_path - return url - - -def check_url(url, retries=2, delay=2): - """ - Check the validity of a URL, with retries if it fails. - - Args: - url (str): URL to check. - retries (int, optional): Number of retries if the URL check fails. Defaults to 2. - delay (int, optional): Delay in seconds between retries. Defaults to 2. - Returns: - tuple: A tuple containing the URL and a boolean indicating whether it is valid. - """ - - if ( - "http://localhost:" in url - or url in skipped_urls - or any(url.endswith(reference) for reference in custom_myst_references) - or os.path.isfile(url) - or os.path.isdir(url) - or url.startswith(("mailto:", "attachment:")) - ): - return url, True - - # If it's not an HTTP URL at this point, it's likely a broken local file reference - if not url.startswith(("http://", "https://")): - return url, False - - attempts = 0 - while attempts <= retries: - try: - response = requests.head(url, allow_redirects=True, timeout=5) - if response.status_code >= 400: - attempts += 1 - if attempts > retries: - return url, False - time.sleep(delay) - else: - return url, True - except requests.RequestException: - attempts += 1 - if attempts > retries: - return url, False - time.sleep(delay) - - # If we exit the loop without returning, the URL is broken - return url, False - - -def extract_all_urls_from_files(files): - """ - Extract all URLs from all files, returning a dict of {file_path: [urls]}. - """ - file_urls = {} - skipped_files = ["doc/blog/"] - - for file_path in files: - if any(file_path.startswith(skipped) for skipped in skipped_files): - continue - urls = extract_urls(file_path) - resolved_urls = [resolve_relative_url(file_path, url) for url in urls] - if resolved_urls: - file_urls[file_path] = resolved_urls - - return file_urls - - -def check_all_links_parallel(file_urls, max_workers=20): - """ - Check all URLs across all files in parallel with a shared thread pool. - - Args: - file_urls: Dict of {file_path: [urls]} - max_workers: Max concurrent HTTP requests across ALL files - - Returns: - Dict of {file_path: [broken_urls]} - """ - all_broken_urls = {} - - # Create a mapping of url -> file_path for tracking which file each URL came from - url_to_files = {} - for file_path, urls in file_urls.items(): - for url in urls: - if url not in url_to_files: - url_to_files[url] = [] - url_to_files[url].append(file_path) - - # Check all unique URLs in parallel - url_results = {} - with ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = {executor.submit(check_url, url): url for url in url_to_files} - for future in as_completed(futures): - url = futures[future] - _, is_valid = future.result() - url_results[url] = is_valid - - # Map broken URLs back to their files - for url, is_valid in url_results.items(): - if not is_valid: - for file_path in url_to_files[url]: - if file_path not in all_broken_urls: - all_broken_urls[file_path] = [] - all_broken_urls[file_path].append(url) - - return all_broken_urls - - -if __name__ == "__main__": - files = sys.argv[1:] - - print(f"Extracting URLs from {len(files)} file(s)...") - file_urls = extract_all_urls_from_files(files) - - if not file_urls: - print("No URLs found to check.") - sys.exit(0) - - total_urls = sum(len(urls) for urls in file_urls.values()) - unique_urls = len({url for urls in file_urls.values() for url in urls}) - print(f"Checking {unique_urls} unique URL(s) across {len(file_urls)} file(s) (total: {total_urls})...") - - all_broken_urls = check_all_links_parallel(file_urls, max_workers=30) - - if all_broken_urls: - print("\n" + "=" * 80) - for file_path, urls in all_broken_urls.items(): - print(f"Broken links in {file_path}:") - for url in urls: - print(f" - {url}") - print("=" * 80) - sys.exit(1) - else: - print("No broken links found.") diff --git a/doc/blog/2025_01_27.md b/doc/blog/2025_01_27.md index a151ab321f..4c83af64b6 100644 --- a/doc/blog/2025_01_27.md +++ b/doc/blog/2025_01_27.md @@ -78,11 +78,11 @@ Finally, when PyRIT gets a response from the Target LLM, it switches to another When examining this request, you may discover that occasionally the Adversarial LLM struggles with generating the right JSON format, leading to an error in PyRIT, regardless of whether the objective was achieved or not. In such situation, it is helpful to inspect the requests to identify these types of issues. Specifically, I found a problem when the LLM response contained double quotes, causing issues with subsequent JSON formats which was fixed using the "SearchReplaceConverter"[^9] prompt converter. -[^7]: "Multi-Turn Attack - RedTeamingAttack Example", https://microsoft.github.io/PyRIT/code/executor/attack/2_red_teaming_attack.html +[^7]: "Multi-Turn Attack - RedTeamingAttack Example", ../code/executor/attack/2_red_teaming_attack.ipynb -[^8]: "PyRIT - SearchReplaceConverter", https://microsoft.github.io/PyRIT/_autosummary/pyrit.prompt_converter.SearchReplaceConverter.html +[^8]: "PyRIT - SearchReplaceConverter", ../api/pyrit_prompt_converter.md#searchreplaceconverter -[^9]: "PyRIT - True False Scoring", https://microsoft.github.io/PyRIT/code/scoring/2_true_false_scorers.html#true-false-scoring +[^9]: "PyRIT - True False Scoring", ../code/scoring/2_true_false_scorers.ipynb#true-false-scoring ### Final Thoughts diff --git a/doc/blog/2025_02_11.md b/doc/blog/2025_02_11.md index a8abfe1ae1..248f84413a 100644 --- a/doc/blog/2025_02_11.md +++ b/doc/blog/2025_02_11.md @@ -32,6 +32,6 @@ See the updated documentation [here](../code/datasets/1_loading_datasets.ipynb). ## What else can we do with this? -Now that we've loaded our dataset into PyRIT as a `SeedPromptDataset` the really exciting red teaming can begin. A great example of this is in our [Baseline-Only Execution](../code/scenarios/9_baseline_only.ipynb) notebook! We can use the prompts to evaluate the target by sending all the previously loaded prompts, modifying which attacks to use, and storing the scores for further analysis. +Now that we've loaded our dataset into PyRIT as a `SeedPromptDataset` the really exciting red teaming can begin. A great example of this is the [Baseline Execution](../code/scenarios/0_scenarios.ipynb#baseline-execution) section of our scenarios overview! We can use the prompts to evaluate the target by sending all the previously loaded prompts, modifying which attacks to use, and storing the scores for further analysis. In this blog post, we've walked through how we use structured datasets through our `SeedPrompt` and `SeedPromptDataset` classes. PyRIT's architecture allows for customization at every stage - whether through converters or configuring different scorers - and seed prompts set us up to effectively probe for risks in AI systems. Send over any contributions to add more datasets, refine seed prompts, or any open issues on Github! Now that you understand a core component of PyRIT, go ahead and try it out! diff --git a/doc/blog/2025_06_06.md b/doc/blog/2025_06_06.md index c9ff1e2a01..c32c43808b 100644 --- a/doc/blog/2025_06_06.md +++ b/doc/blog/2025_06_06.md @@ -12,7 +12,7 @@ The [AI Recruiter](https://github.com/KutalVolkan/ai_recruiter) is designed to m - Résumé Processing & Semantic Matching: Résumés are extracted from PDFs, with embeddings generated using models like text-embedding-ada-002. These embeddings enable semantic matching, while GPT-4o is later used to assign a match score based on relevance and extracted content. -- Automated RAG Vulnerability Testing: Attackers can manipulate résumé content by injecting hidden text (via a [PDF converter](https://github.com/microsoft/PyRIT/blob/main/doc/code/converters/pdf_converter.ipynb)) that optimizes scoring, influencing the AI Recruiter’s ranking system. +- Automated RAG Vulnerability Testing: Attackers can manipulate résumé content by injecting hidden text (via a [PDF converter](../code/converters/5_file_converters.ipynb#pdfconverter)) that optimizes scoring, influencing the AI Recruiter’s ranking system. - [XPIA Attack](https://github.com/microsoft/PyRIT/blob/main/doc/code/executor/workflow/2_xpia_ai_recruiter.ipynb) Integration: PyRIT enables full automation of prompt injections, making AI vulnerability research efficient and reproducible. --- diff --git a/doc/blog/2026_04_14_scoring_scorers.md b/doc/blog/2026_04_14_scoring_scorers.md index 9e7eec0cbb..9358a8762b 100644 --- a/doc/blog/2026_04_14_scoring_scorers.md +++ b/doc/blog/2026_04_14_scoring_scorers.md @@ -108,7 +108,7 @@ flowchart TB There are a few different ways to view metrics for specific scoring configurations. -**Directly on a scorer instance:** Call `get_scorer_metrics()` on any scorer object to look up its saved metrics (if they exist), as described at the bottom of the [Scorer Evaluation Identifier](#scorer-evaluation-identifier) section above. See the [scorer metrics notebook](../code/scoring/8_scorer_metrics.ipynb) to try it yourself! +**Directly on a scorer instance:** Call `get_scorer_metrics()` on any scorer object to look up its saved metrics (if they exist), as described at the bottom of the [Scorer Evaluation Identifier](#scorer-evaluation-identifier) section above. See the [scorer metrics notebook](../code/scoring/7_scorer_metrics.ipynb) to try it yourself! **Automatically in scenario output:** When running scenarios and printing results (i.e., in [pyrit_scan](../scanner/1_pyrit_scan.ipynb) or [pyrit_shell](../scanner/2_pyrit_shell.md)), metrics are automatically fetched and displayed alongside the attack results (as long as the scoring configuration has been evaluated before): @@ -132,7 +132,7 @@ The framework checks the JSONL registry for an existing entry matching the score ![alt text](2026_04_14_running_evaluation.png) -For the full walkthrough — including running objective and harm evaluations, configuring custom datasets, and comparing results — give the [scorer metrics notebook](../code/scoring/8_scorer_metrics.ipynb) a try! +For the full walkthrough — including running objective and harm evaluations, configuring custom datasets, and comparing results — give the [scorer metrics notebook](../code/scoring/7_scorer_metrics.ipynb) a try! ## Closing Thoughts diff --git a/doc/code/converters/1_text_to_text_converters.ipynb b/doc/code/converters/1_text_to_text_converters.ipynb index dfb2de1856..1e996efc5d 100644 --- a/doc/code/converters/1_text_to_text_converters.ipynb +++ b/doc/code/converters/1_text_to_text_converters.ipynb @@ -22,7 +22,7 @@ "id": "1", "metadata": {}, "source": [ - "\n", + "(non-llm-converters)=\n", "## Non-LLM Converters\n", "\n", "Non-LLM converters use deterministic algorithms to transform text. These include:\n", @@ -454,7 +454,7 @@ "id": "10", "metadata": {}, "source": [ - "\n", + "(llm-based-converters)=\n", "## LLM-Based Converters\n", "\n", "LLM-based converters use language models to transform prompts. These converters are more flexible and can produce more natural variations, but they are slower and require an LLM target.\n", diff --git a/doc/code/converters/1_text_to_text_converters.py b/doc/code/converters/1_text_to_text_converters.py index 67cc57919d..6237720753 100644 --- a/doc/code/converters/1_text_to_text_converters.py +++ b/doc/code/converters/1_text_to_text_converters.py @@ -22,7 +22,7 @@ # - **[LLM-Based Converters](#llm-based-converters)**: AI-powered transformations including translation, variation, and semantic modifications # %% [markdown] -# +# (non-llm-converters)= # ## Non-LLM Converters # # Non-LLM converters use deterministic algorithms to transform text. These include: @@ -225,7 +225,7 @@ print("Variation Selector:", await var_selector.convert_async(prompt=prompt)) # type: ignore # %% [markdown] -# +# (llm-based-converters)= # ## LLM-Based Converters # # LLM-based converters use language models to transform prompts. These converters are more flexible and can produce more natural variations, but they are slower and require an LLM target. diff --git a/doc/code/converters/2_audio_converters.ipynb b/doc/code/converters/2_audio_converters.ipynb index f125cba736..c5b199fd7e 100644 --- a/doc/code/converters/2_audio_converters.ipynb +++ b/doc/code/converters/2_audio_converters.ipynb @@ -23,7 +23,7 @@ "id": "1", "metadata": {}, "source": [ - "\n", + "(text-to-audio)=\n", "## Text to Audio\n", "\n", "The `AzureSpeechTextToAudioConverter` converts text input into audio output, generating spoken audio files." @@ -72,7 +72,7 @@ "id": "3", "metadata": {}, "source": [ - "\n", + "(audio-to-text)=\n", "## Audio to Text\n", "\n", "The `AzureSpeechAudioToTextConverter` transcribes audio files into text. Below we use the audio file created in the previous section." @@ -117,7 +117,7 @@ "id": "5", "metadata": {}, "source": [ - "\n", + "(audio-to-audio)=\n", "## Audio to Audio\n", "\n", "Audio-to-audio converters modify existing audio files. All of these converters accept `audio_path` input\n", @@ -240,7 +240,8 @@ ], "metadata": { "jupytext": { - "cell_metadata_filter": "-all" + "cell_metadata_filter": "-all", + "main_language": "python" }, "language_info": { "codemirror_mode": { diff --git a/doc/code/converters/2_audio_converters.py b/doc/code/converters/2_audio_converters.py index 127b5c2b15..7c226f0767 100644 --- a/doc/code/converters/2_audio_converters.py +++ b/doc/code/converters/2_audio_converters.py @@ -23,7 +23,7 @@ # - **[Audio to Audio](#audio-to-audio)**: Modify audio files (speed, volume, echo, frequency, noise) # %% [markdown] -# +# (text-to-audio)= # ## Text to Audio # # The `AzureSpeechTextToAudioConverter` converts text input into audio output, generating spoken audio files. @@ -45,7 +45,7 @@ assert os.path.exists(audio_convert_result.output_text) # %% [markdown] -# +# (audio-to-text)= # ## Audio to Text # # The `AzureSpeechAudioToTextConverter` transcribes audio files into text. Below we use the audio file created in the previous section. @@ -70,7 +70,7 @@ print(transcript) # %% [markdown] -# +# (audio-to-audio)= # ## Audio to Audio # # Audio-to-audio converters modify existing audio files. All of these converters accept `audio_path` input diff --git a/doc/code/converters/3_image_converters.ipynb b/doc/code/converters/3_image_converters.ipynb index 2e3277d34a..cf71440a31 100644 --- a/doc/code/converters/3_image_converters.ipynb +++ b/doc/code/converters/3_image_converters.ipynb @@ -22,7 +22,7 @@ "id": "1", "metadata": {}, "source": [ - "\n", + "(text-to-image)=\n", "## Text to Image\n", "\n", "### QRCodeConverter\n", @@ -148,7 +148,7 @@ "id": "5", "metadata": {}, "source": [ - "\n", + "(image-to-image)=\n", "## Image to Image\n", "\n", "### AddTextImageConverter\n", diff --git a/doc/code/converters/3_image_converters.py b/doc/code/converters/3_image_converters.py index 3bf7ecac80..846a468447 100644 --- a/doc/code/converters/3_image_converters.py +++ b/doc/code/converters/3_image_converters.py @@ -21,7 +21,7 @@ # - **[Image to Image](#image-to-image)**: Modify or transform existing images # %% [markdown] -# +# (text-to-image)= # ## Text to Image # # ### QRCodeConverter @@ -76,7 +76,7 @@ display(image) # %% [markdown] -# +# (image-to-image)= # ## Image to Image # # ### AddTextImageConverter diff --git a/doc/code/converters/4_video_converters.ipynb b/doc/code/converters/4_video_converters.ipynb index 12f675f8dd..53e3ed6bac 100644 --- a/doc/code/converters/4_video_converters.ipynb +++ b/doc/code/converters/4_video_converters.ipynb @@ -21,7 +21,7 @@ "id": "1", "metadata": {}, "source": [ - "\n", + "(image-to-video)=\n", "## Image to Video\n", "\n", "### AddImageVideoConverter\n", @@ -74,6 +74,9 @@ } ], "metadata": { + "jupytext": { + "main_language": "python" + }, "language_info": { "codemirror_mode": { "name": "ipython", diff --git a/doc/code/converters/4_video_converters.py b/doc/code/converters/4_video_converters.py index c537193c1a..0d0aec6bc6 100644 --- a/doc/code/converters/4_video_converters.py +++ b/doc/code/converters/4_video_converters.py @@ -20,7 +20,7 @@ # - **[Image to Video](#image-to-video)**: Add images to video files # %% [markdown] -# +# (image-to-video)= # ## Image to Video # # ### AddImageVideoConverter diff --git a/doc/code/memory/4_manually_working_with_memory.md b/doc/code/memory/4_manually_working_with_memory.md index f20af7a0e7..7c29b6846b 100644 --- a/doc/code/memory/4_manually_working_with_memory.md +++ b/doc/code/memory/4_manually_working_with_memory.md @@ -13,7 +13,7 @@ The second way to share data is to use local SQLite Memory (see [here](../memory 1. Export and import the database as described [here](https://dbeaver.com/docs/dbeaver/Data-transfer/). This allows a lot of flexibility and can include partial exports (for example based on labels or time): 2. Copy the PyRIT `results/dbdata` directory over; it will contain multi-modal data that the database references. -See https://dbeaver.com/docs/guides/sql_editors/sqlite.html for a more comprehensive guide on using DBeaver with SQLite. +See https://dbeaver.com/docs/dbeaver/Database-driver-SQLite/ for a more comprehensive guide on using DBeaver with SQLite. ## Using SQLite and Excel to Query and Visualize Data diff --git a/doc/contributing/10_release_process.md b/doc/contributing/10_release_process.md index a5aaa644fd..bb9ac4ae86 100644 --- a/doc/contributing/10_release_process.md +++ b/doc/contributing/10_release_process.md @@ -75,15 +75,15 @@ links work properly. Note: There may not be any links to update, but it is good practice to check in case our README changes. Replace all "main" links like -"https://github.com/microsoft/PyRIT/blob/main/doc/index.md" with "raw" links that have +`https://github.com/microsoft/PyRIT/blob/main/doc/index.md` with "raw" links that have the correct version number, i.e., -"https://raw.githubusercontent.com/microsoft/PyRIT/releases/vx.y.z/doc/index.md". +`https://raw.githubusercontent.com/microsoft/PyRIT/releases/vx.y.z/doc/index.md`. For images, update using the "raw" link, e.g., -"https://raw.githubusercontent.com/microsoft/PyRIT/releases/vx.y.z/assets/pyrit_architecture.png". +`https://raw.githubusercontent.com/microsoft/PyRIT/releases/vx.y.z/assets/pyrit_architecture.png`. For directories, update using the "tree" link, e.g., -"https://github.com/microsoft/PyRIT/tree/releases/vx.y.z/doc/code" +`https://github.com/microsoft/PyRIT/tree/releases/vx.y.z/doc/code` This is required for the release branch because PyPI does not pick up other files besides the README, which results in local links breaking. diff --git a/doc/contributing/3_style_guide.md b/doc/contributing/3_style_guide.md index 92e9d2ed38..192b65398b 100644 --- a/doc/contributing/3_style_guide.md +++ b/doc/contributing/3_style_guide.md @@ -26,7 +26,7 @@ Deviations from any particular rule can occur depending on context and need. - One parameter per line - Spaces not tabs. Tab value must be 4 spaces - Test names should be test_foo test_bar, test_baz, etc. -- In the case of type name conflicts, the desired type should be used in its fully-qualified (or disambiguating relatively-qualified) form: e.g. "pyrit.shared.foo" +- In the case of type name conflicts, the desired type should be used in its fully-qualified (or disambiguating relatively-qualified) form: e.g. `pyrit.shared.foo` - Naming should follow typical Python naming. e.g. some_descriptive_name - PyRIT imports go last (after a newline) - Imports should go in alphabetical order diff --git a/doc/contributing/5_unit_tests.md b/doc/contributing/5_unit_tests.md index 07eb74ce1a..b0d48ec879 100644 --- a/doc/contributing/5_unit_tests.md +++ b/doc/contributing/5_unit_tests.md @@ -13,4 +13,4 @@ Testing is an art to get right! But here are some best practices in terms of uni - Don't write to the actual database, use a `MagicMock` for the memory object or use `patch_central_database` as the database connection. -Not all of our current tests follow these practices (we're working on it!) But for some good examples, see [test_tts_send_prompt_file_save_async](../../tests/unit/target/test_tts_target.py), which has many of these best practices incorporated in the test. +Not all of our current tests follow these practices (we're working on it!) But for some good examples, see [test_tts_send_prompt_file_save_async](../../tests/unit/prompt_target/target/test_tts_target.py), which has many of these best practices incorporated in the test. diff --git a/doc/getting_started/troubleshooting/deploy_hf_model_aml.ipynb b/doc/getting_started/troubleshooting/deploy_hf_model_aml.ipynb index 254bea3bd1..2b5600ce5a 100644 --- a/doc/getting_started/troubleshooting/deploy_hf_model_aml.ipynb +++ b/doc/getting_started/troubleshooting/deploy_hf_model_aml.ipynb @@ -19,7 +19,7 @@ " pip install azure-ai-ml\n", " pip install azure-identity\n", " ```\n", - "- Execute the `az login` command to sign in to your Azure subscription. For detailed instructions, refer to the \"Authenticate with Azure Subscription\" section in the markdown file provided [here](../setup/populating_secrets.md)\n", + "- Execute the `az login` command to sign in to your Azure subscription. For detailed instructions, refer to the \"Authenticate with Azure Subscription\" section in the markdown file provided [here](../populating_secrets.md)\n", "- A Hugging Face model should be present in the AZURE ML model catalog. If it is missing, execute the [notebook](./download_and_register_hf_model_aml.ipynb) to download and register the Hugging Face model in the AZURE ML registry." ] }, @@ -311,6 +311,9 @@ } ], "metadata": { + "jupytext": { + "main_language": "python" + }, "language_info": { "codemirror_mode": { "name": "ipython", diff --git a/doc/getting_started/troubleshooting/download_and_register_hf_model_aml.ipynb b/doc/getting_started/troubleshooting/download_and_register_hf_model_aml.ipynb index 700e9a9583..29e7fdec00 100644 --- a/doc/getting_started/troubleshooting/download_and_register_hf_model_aml.ipynb +++ b/doc/getting_started/troubleshooting/download_and_register_hf_model_aml.ipynb @@ -37,7 +37,7 @@ " pip install azure-ai-ml\n", " pip install azure-identity\n", " ```\n", - "- Execute the `az login` command to sign in to your Azure subscription. For detailed instructions, refer to the \"Authenticate with Azure Subscription\" section [here](../setup/populating_secrets.md)" + "- Execute the `az login` command to sign in to your Azure subscription. For detailed instructions, refer to the \"Authenticate with Azure Subscription\" section [here](../populating_secrets.md)" ] }, { @@ -66,9 +66,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "lines_to_next_cell": 0 - }, + "metadata": {}, "outputs": [], "source": [ "# Import the Azure ML SDK components required for workspace connection and model management.\n", @@ -503,6 +501,9 @@ } ], "metadata": { + "jupytext": { + "main_language": "python" + }, "language_info": { "codemirror_mode": { "name": "ipython", diff --git a/doc/getting_started/troubleshooting/score_aml_endpoint.ipynb b/doc/getting_started/troubleshooting/score_aml_endpoint.ipynb index cc13c0ba1b..76aa4135e5 100644 --- a/doc/getting_started/troubleshooting/score_aml_endpoint.ipynb +++ b/doc/getting_started/troubleshooting/score_aml_endpoint.ipynb @@ -16,7 +16,7 @@ "Before proceeding with this notebook, ensure the following prerequisites are met:\n", "\n", "1. **Azure ML Model Deployment**: Your Azure ML model must be deployed to an Azure ML managed online endpoint. If your model is not yet deployed, please follow the instructions in the [deployment notebook](./deploy_hf_model_aml.ipynb).\n", - "2. Execute the `az login` command to sign in to your Azure subscription. For detailed instructions, refer to the \"Authenticate with Azure Subscription\" section [here](../setup/populating_secrets.md)\n", + "2. Execute the `az login` command to sign in to your Azure subscription. For detailed instructions, refer to the \"Authenticate with Azure Subscription\" section [here](../populating_secrets.md)\n", "\n", "\n", "## Environment Variables\n", @@ -89,7 +89,9 @@ "cell_type": "code", "execution_count": null, "id": "4", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [ { "name": "stdout", @@ -173,6 +175,9 @@ } ], "metadata": { + "jupytext": { + "main_language": "python" + }, "language_info": { "codemirror_mode": { "name": "ipython", diff --git a/doc/myst.yml b/doc/myst.yml index df83376e47..d055e0c7b2 100644 --- a/doc/myst.yml +++ b/doc/myst.yml @@ -11,6 +11,31 @@ project: - format: pdf template: plain_latex_book output: exports/book.pdf + # See https://mystmd.org/guide for error_rules schema. + # Rule IDs come from https://github.com/jupyter-book/mystmd + # (packages/myst-common/src/ruleids.ts). + error_rules: + # mystmd v2.x has no LaTeX renderer for the MyST layout directives we use + # extensively in the HTML site (grid, tabSet, details, mermaid). They + # render fine in HTML; PDF export emits "Unhandled LaTeX conversion for + # node of X" errors per node. Track upstream support and remove this + # suppression when those renderers land. + - rule: tex-renders + severity: ignore + + # External APIs that require auth — always return 401/403 from CI workers + # and public link checkers. The URLs themselves are correct; they just + # can't be verified without credentials. Every other broken URL in the + # tree has been fixed (see PR #1745); only add to this list when a URL + # is genuinely unverifiable, not just inconvenient to update. + - rule: link-resolves + severity: ignore + keys: + - "https://platform.openai.com/**" + - "https://api.openai.com/**" + - "http://api.openai.com" + - "http://api.openai.com/**" + - "https://cognitiveservices.azure.com/**" toc: - file: index.md - file: getting_started/README.md diff --git a/pyrit/models/storage_io.py b/pyrit/models/storage_io.py index 9f050b4c62..3491b4b749 100644 --- a/pyrit/models/storage_io.py +++ b/pyrit/models/storage_io.py @@ -300,9 +300,9 @@ async def read_file(self, path: Union[Path, str]) -> bytes: """ Asynchronously reads the content of a file (blob) from Azure Blob Storage. - If the provided `path` is a full URL - (e.g., "https://account.blob.core.windows.net/container/dir1/dir2/sample.png"), - it extracts the relative blob path (e.g., "dir1/dir2/sample.png") to correctly access the blob. + If the provided ``path`` is a full URL + (e.g., ``https://account.blob.core.windows.net/container/dir1/dir2/sample.png``), + it extracts the relative blob path (e.g., ``dir1/dir2/sample.png``) to correctly access the blob. If a relative path is provided, it will use it as-is. Args: @@ -313,10 +313,11 @@ async def read_file(self, path: Union[Path, str]) -> bytes: bytes: The content of the file (blob) as bytes. Example: - file_content = - await read_file("https://account.blob.core.windows.net/container/dir2/1726627689003831.png") - # Or using a relative path: - file_content = await read_file("dir1/dir2/1726627689003831.png") + ``file_content = await read_file("https://account.blob.core.windows.net/container/dir2/1726627689003831.png")`` + + Or using a relative path: + + ``file_content = await read_file("dir1/dir2/1726627689003831.png")`` """ if not self._client_async: diff --git a/tests/unit/build_scripts/test_check_links.py b/tests/unit/build_scripts/test_check_links.py deleted file mode 100644 index 75bc4aec45..0000000000 --- a/tests/unit/build_scripts/test_check_links.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from pathlib import Path - -from build_scripts.check_links import extract_urls, resolve_relative_url, strip_fragment - - -def test_strip_fragment_removes_fragment() -> None: - assert strip_fragment("https://example.com/page#section") == "https://example.com/page" - - -def test_strip_fragment_no_fragment_unchanged() -> None: - assert strip_fragment("https://example.com/page") == "https://example.com/page" - - -def test_strip_fragment_empty_fragment() -> None: - assert strip_fragment("https://example.com/page#") == "https://example.com/page" - - -def test_strip_fragment_preserves_query_string() -> None: - result = strip_fragment("https://example.com/page?q=1#section") - assert "q=1" in result - assert "section" not in result - - -def test_resolve_relative_url_http_url_unchanged() -> None: - url = "https://example.com" - assert resolve_relative_url("/some/file.md", url) == url - - -def test_resolve_relative_url_mailto_unchanged() -> None: - url = "mailto:test@example.com" - assert resolve_relative_url("/some/file.md", url) == url - - -def test_resolve_relative_url_resolved(tmp_path: Path) -> None: - base = str(tmp_path / "docs" / "file.md") - target = tmp_path / "docs" / "other.md" - target.parent.mkdir(parents=True, exist_ok=True) - target.write_text("# Other") - result = resolve_relative_url(base, "other.md") - assert result == str(target) - - -def test_resolve_relative_url_with_md_extension(tmp_path: Path) -> None: - base = str(tmp_path / "docs" / "file.md") - target = tmp_path / "docs" / "other.md" - target.parent.mkdir(parents=True, exist_ok=True) - target.write_text("# Other") - result = resolve_relative_url(base, "other") - assert result.endswith(".md") - - -def test_extract_urls_extracts_markdown_links(tmp_path: Path) -> None: - f = tmp_path / "test.md" - f.write_text("[Click here](https://example.com)") - urls = extract_urls(str(f)) - assert "https://example.com" in urls - - -def test_extract_urls_extracts_href_links(tmp_path: Path) -> None: - f = tmp_path / "test.html" - f.write_text('link') - urls = extract_urls(str(f)) - assert "https://example.com" in urls - - -def test_extract_urls_extracts_src_links(tmp_path: Path) -> None: - f = tmp_path / "test.html" - f.write_text('') - urls = extract_urls(str(f)) - assert "https://example.com/image.png" in urls - - -def test_extract_urls_empty_file_returns_no_urls(tmp_path: Path) -> None: - f = tmp_path / "empty.md" - f.write_text("") - urls = extract_urls(str(f)) - assert urls == [] - - -def test_extract_urls_strips_fragments(tmp_path: Path) -> None: - f = tmp_path / "test.md" - f.write_text("[link](https://example.com/page#section)") - urls = extract_urls(str(f)) - assert "https://example.com/page" in urls - assert not any("#section" in u for u in urls)