diff --git a/datasets/quick-start.mdx b/datasets/quick-start.mdx
new file mode 100644
index 0000000..8f7ebf5
--- /dev/null
+++ b/datasets/quick-start.mdx
@@ -0,0 +1,63 @@
+---
+title: "Quick Start"
+---
+
+
+
+
+
+Datasets are simple data tables that you can use to manage your data for experiments and evaluation of your AI applications.
+Datasets are available in the SDK, and they enable you to create versioned snapshots for reproducible testing.
+
+
+
+
+Click **New Dataset** to create a dataset. Give it a descriptive name that reflects its purpose or use case, add a description to help your team understand its context, and provide a slug so you can reference the dataset in the SDK.
+
+
+
+
+
+Add rows and columns to structure your dataset.
+You can add different column types:
+- **Text**: For prompts, model responses, or any textual data
+- **Number**: For numerical values, scores, or metrics
+- **Boolean**: For true/false flags or binary classifications
+
+
+  Use meaningful column names that clearly describe what each field contains.
+  This makes it easier to work with your dataset in code, keeps evaluator inputs unambiguous, and helps team members collaborate.
+
+
+
+
+
+
+
+
+
+
+
+
+Once you're satisfied with your dataset structure and data:
+1. Click **Publish Version** to create a stable snapshot
+2. Published versions are immutable
+3. Published versions are accessible in the SDK
+
+
+
+
+
+You can access all published versions of your dataset by opening the version history modal. This allows you to:
+- Compare different versions of your dataset
+- Track changes over time
+- Switch between versions
+
+
+
\ No newline at end of file
diff --git a/datasets/sdk-usage.mdx b/datasets/sdk-usage.mdx
new file mode 100644
index 0000000..fff3f86
--- /dev/null
+++ b/datasets/sdk-usage.mdx
@@ -0,0 +1,226 @@
+---
+title: "SDK usage"
+description: "Access your managed datasets with the Traceloop SDK"
+---
+
+## SDK Initialization
+
+First, initialize the Traceloop SDK.
+
+
+
+```python Python
+from traceloop.sdk import Traceloop
+
+# Initialize with dataset sync enabled
+client = Traceloop.init()
+```
+
+```js Typescript
+import * as traceloop from "@traceloop/node-server-sdk";
+
+// Initialize with comprehensive configuration
+traceloop.initialize({
+  appName: "your-app-name",
+  apiKey: process.env.TRACELOOP_API_KEY,
+  disableBatch: true,
+  traceloopSyncEnabled: true,
+});
+
+// Wait for initialization to complete
+await traceloop.waitForInitialization();
+
+// Get the client instance for dataset operations
+const client = traceloop.getClient();
+```
+
+
+
+  Make sure you've created an API key and set it as an environment variable
+  `TRACELOOP_API_KEY` before you start. Check out the SDK's [getting started
+  guide](/openllmetry/getting-started-python) for more information.
+
+
+The SDK fetches your datasets from Traceloop servers. Changes made to a draft dataset version are immediately available in the UI.
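+
+For orientation, the operations documented below compose into a short end-to-end flow. The following is a minimal sketch that reuses the `product-inventory` example dataset from this page and only the calls documented in the sections that follow:
+
+```python Python
+# Fetch the current draft of an existing dataset by its slug
+my_dataset = client.datasets.get_by_slug("product-inventory")
+
+# Append a row keyed by the dataset's column slugs
+my_dataset.add_rows([
+    {"product": "Webcam", "price": 59.99, "in_stock": True, "category": "Electronics"}
+])
+
+# Freeze the current state as an immutable, published version
+published_version = my_dataset.publish()
+```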
+
+## Dataset Operations
+
+### Create a dataset
+
+You can create datasets in different ways depending on your data source:
+- **Python**: Import from CSV file or pandas DataFrame
+- **TypeScript**: Import from CSV data or create manually
+
+
+
+```python Python
+import pandas as pd
+from traceloop.sdk import Traceloop
+
+client = Traceloop.init()
+
+# Create dataset from CSV file
+dataset_csv = client.datasets.from_csv(
+    file_path="path/to/your/data.csv",
+    slug="medical-questions",
+    name="Medical Questions",
+    description="Dataset with patients' medical questions"
+)
+
+# Create dataset from pandas DataFrame
+data = {
+    "product": ["Laptop", "Mouse", "Keyboard", "Monitor"],
+    "price": [999.99, 29.99, 79.99, 299.99],
+    "in_stock": [True, True, False, True],
+    "category": ["Electronics", "Accessories", "Accessories", "Electronics"],
+}
+df = pd.DataFrame(data)
+
+# Create dataset from DataFrame
+dataset_df = client.datasets.from_dataframe(
+    df=df,
+    slug="product-inventory",
+    name="Product Inventory",
+    description="Sample product inventory data",
+)
+```
+
+```js Typescript
+const client = traceloop.getClient();
+
+// Option 1: Create dataset manually
+const myDataset = await client.datasets.create({
+  name: "Medical Questions",
+  slug: "medical-questions",
+  description: "Dataset with patients' medical questions"
+});
+
+// Option 2: Create and import from CSV data
+const csvData = `user_id,prompt,response,model,satisfaction_score
+user_001,"What is React?","React is a JavaScript library...","gpt-3.5-turbo",4
+user_002,"Explain Docker","Docker is a containerization platform...","gpt-3.5-turbo",5`;
+
+await myDataset.fromCSV(csvData, { hasHeader: true });
+```
+
+
+
+### Get a dataset
+The dataset can be retrieved using its slug, which is available on the dataset page in the UI.
+
+
+```python Python
+# Get dataset by slug - current draft version
+my_dataset = client.datasets.get_by_slug("medical-questions")
+
+# Get specific version as CSV
+dataset_csv = client.datasets.get_version_csv(
+    slug="medical-questions",
+    version="v2"
+)
+```
+
+```js Typescript
+// Get dataset by slug - current draft version
+const myDataset = await client.datasets.get("medical-questions");
+
+// Get specific version as CSV
+const datasetCsv = await client.datasets.getVersionCSV("medical-questions", "v1");
+```
+
+
+
+### Adding a Column
+
+
+
+```python Python
+from traceloop.sdk.dataset import ColumnType
+
+# Add a new column to your dataset
+new_column = my_dataset.add_column(
+    slug="confidence_score",
+    name="Confidence Score",
+    col_type=ColumnType.NUMBER
+)
+```
+
+```js Typescript
+// Define schema by adding multiple columns
+const columnsToAdd = [
+  {
+    name: "User ID",
+    slug: "user-id",
+    type: "string" as const,
+    description: "Unique identifier for the user"
+  },
+  {
+    name: "Satisfaction score",
+    slug: "satisfaction-score",
+    type: "number" as const,
+    description: "User satisfaction rating (1-5)"
+  }
+];
+
+await myDataset.addColumn(columnsToAdd);
+console.log("Schema defined with multiple columns");
+```
+
+
+
+### Adding Rows
+
+Map each column slug to its corresponding value.
+
+
+```python Python
+# Add new rows to your dataset
+row_data = {
+    "product": "TV Screen",
+    "price": 1500.0,
+    "in_stock": True,
+    "category": "Electronics"
+}
+
+my_dataset.add_rows([row_data])
+```
+
+```js Typescript
+// Add individual rows to dataset
+const userId = "user_001";
+const prompt = "Explain machine learning in simple terms";
+const startTime = Date.now();
+
+const rowData = {
+  user_id: userId,
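+  // The keys below map to the dataset's column slugs,
+  // and each value should match that column's declared type.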
+  prompt: prompt,
+  response: `This is the model response`,
+  model: "gpt-3.5-turbo",
+  satisfaction_score: 1,
+};
+
+await myDataset.addRow(rowData);
+```
+
+
+
+## Dataset Versions
+
+### Publish a dataset
+Dataset versions and history can be viewed in the UI. Versioning allows you to run the same evaluations and experiments across different dataset versions, making meaningful comparisons possible.
+
+
+```python Python
+# Publish the current dataset state as a new version
+published_version = my_dataset.publish()
+```
+
+```js Typescript
+// Publish the current dataset state as a new version
+const publishedVersion = await myDataset.publish();
+```
+
+
+
diff --git a/experiments/introduction.mdx b/experiments/introduction.mdx
new file mode 100644
index 0000000..8a890e4
--- /dev/null
+++ b/experiments/introduction.mdx
@@ -0,0 +1,31 @@
+---
+title: "Introduction"
+---
+
+Building reliable LLM applications means knowing whether a new prompt, model, or workflow change actually makes things better.
+
+
+
+
+
+
+Experiments in Traceloop provide teams with a structured workflow for testing and comparing results across different prompts, models, and evaluator checks, all against real datasets.
+
+## What You Can Do with Experiments
+
+
+
+    Execute multiple evaluation checks against your dataset
+
+
+    See all experiment run outputs in a comprehensive table view with relevant indicators and detailed reasoning
+
+
+    Run the same experiment across different dataset versions to see how it affects your workflow
+
+
+    Add a tailored task to the experiment to create evaluator input. For example: LLM calls, semantic search, etc.
+
+
diff --git a/experiments/result-overview.mdx b/experiments/result-overview.mdx
new file mode 100644
index 0000000..09afae1
--- /dev/null
+++ b/experiments/result-overview.mdx
@@ -0,0 +1,44 @@
+---
+title: "Result Overview"
+---
+
+Experiments are executed through the SDK, and every experiment is logged in the Traceloop platform.
+
+
+
+
+
+## Experiment Runs
+An experiment can be run multiple times against different datasets and tasks. All runs are logged in the Traceloop platform to enable easy comparison.
+
+
+
+
+
+
+## Experiment Tasks
+
+An experiment run is made up of multiple tasks, where each task represents the experiment flow applied to a single dataset row.
+
+The task logging captures:
+
+- Task input – the data taken from the dataset row.
+
+- Task outputs – the results produced by running the task, which are then passed as input to the evaluator.
+
+- Evaluator results – the evaluator’s assessment based on the task outputs.
+
+
+
+
+
diff --git a/experiments/running-from-code.mdx b/experiments/running-from-code.mdx
index b833d9d..01ae5bb 100644
--- a/experiments/running-from-code.mdx
+++ b/experiments/running-from-code.mdx
@@ -1,235 +1,346 @@
 ---
-title: "Running Experiments from Code"
+title: "Run via SDK"
 description: "Learn how to run experiments programmatically using the Traceloop SDK"
 ---
 
 You can run experiments programmatically using the Traceloop SDK. This allows you to systematically evaluate different AI model configurations, prompts, and approaches with your datasets.
 
-## Setup
+## SDK Initialization
 
-First, initialize the Traceloop client in your code:
+First, initialize the Traceloop SDK.
 
-```python
+
+
+```python Python
 from traceloop.sdk import Traceloop
 
-# Initialize Traceloop
-Traceloop.init()
-client = Traceloop.client()
+# Initialize with dataset sync enabled
+client = Traceloop.init()
+```
+
+```js Typescript
+import * as traceloop from "@traceloop/node-server-sdk";
+
+// Initialize with comprehensive configuration
+traceloop.initialize({
+  appName: "your-app-name",
+  apiKey: process.env.TRACELOOP_API_KEY,
+  disableBatch: true,
+  traceloopSyncEnabled: true,
+});
+
+// Wait for initialization to complete
+await traceloop.waitForInitialization();
+
+// Get the client instance for dataset operations
+const client = traceloop.getClient();
 ```
+
+
+
+  Make sure you've created an API key and set it as an environment variable
+  `TRACELOOP_API_KEY` before you start. Check out the SDK's [getting started
+  guide](/openllmetry/getting-started-python) for more information.
+
+
 ## Basic Experiment Structure
 
 An experiment consists of:
 - A **dataset** to test against
 - A **task function** that defines what your AI system should do
-- **evaluators** to measure performance
-- An **experiment slug** to identify the experiment
+- **Evaluators** to measure performance
 
 ## Task Functions
 
-Create task functions that define how your AI system processes each dataset item:
+Create a task function that defines how your AI system processes each dataset row. The task is passed to the experiment as a parameter, and the experiment runs it on each dataset row.
 
-```python
-async def my_task_function(input_data):
-    # Your AI processing logic here
-    # This could involve calling OpenAI, Anthropic, etc.
-
-    response = await openai.ChatCompletion.acreate(
-        model="gpt-4",
-        messages=[
-            {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "user", "content": input_data["question"]}
-        ]
-    )
-
-    return {
-        "response": response.choices[0].message.content,
-        "model": "gpt-4"
-    }
+The task function signature expects:
+- **Input**: An optional dictionary containing the dataset row data
+- **Output**: A dictionary with your task results
+
+
+
+```python Python
+task: Callable[[Optional[Dict[str, Any]]], Dict[str, Any]]
 ```
 
+```typescript TypeScript
+type dataValue = string | number | boolean;
+type TaskInput = Record<string, dataValue>;
+type TaskOutput = Record<string, dataValue>;
+interface ExperimentTaskFunction<TInput = TaskInput, TOutput = TaskOutput> {
+  (input: TInput): Promise<TOutput> | TOutput;
+}
+```
-## Running Experiments
+
 
-Use the `experiment.run()` method to execute your experiment:
-
-```python
-async def run_my_experiment():
-    results, errors = await client.experiment.run(
-        dataset_slug="my-dataset",
-        dataset_version="v1",
-        task=my_task_function,
-        evaluators=["accuracy", "relevance"],
-        experiment_slug="my-experiment-v1"
+A task can perform any desired internal operation—such as LLM calls, semantic search, RAG, or custom business logic. The task output is then used as the evaluator input and is automatically logged in the Traceloop platform.
-    )
-
-    print(f"Experiment completed with {len(results)} results and {len(errors)} errors")
-    return results, errors
+
+  Ensure that the evaluator input schema variables are included in the task output dictionary. You can add extra attributes to the task output even if they are not evaluator input parameters—these will also be logged to the platform.
+
+
+
+```python Python
+from openai import AsyncOpenAI
+import os
+
+def provide_medical_info_prompt(question: str) -> str:
+    """
+    Provides comprehensive medical information without restrictions
+    """
+    return f"""You are a knowledgeable health educator.
+    Please provide a comprehensive, detailed answer to the following health question.
+
+    Question: {question}
+
+    Please provide:
+    1. A clear, factual explanation using accessible language
+    2. Key benefits and important considerations
+    3. Specific recommendations and actionable guidance
+    4. Relevant details about treatments, symptoms, or health practices
+    5. Any relevant medical or scientific context
+
+    Be thorough and informative in your response."""
+
+async def medical_task(row):
+    openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+    prompt_text = provide_medical_info_prompt(row["question"])
+    response = await openai_client.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": prompt_text}],
+        temperature=0.7,
+        max_tokens=500,
+    )
+
+    ai_response = response.choices[0].message.content
+
+    return {"completion": ai_response, "text": ai_response}
+```
 
+```typescript TypeScript
+import { OpenAI } from "openai";
+import type {
+  ExperimentTaskFunction,
+  TaskInput,
+  TaskOutput,
+} from "@traceloop/node-server-sdk";
+
+function provideMedicalInfoPrompt(question: string): string {
+  return `You are a health educator providing comprehensive medical information.
+
+  Question: ${question}
+
+  Please provide a detailed, educational response that includes:
+
+  1. **Clear, factual explanation** of the medical concept or condition
+  2. **Key benefits and considerations** related to the topic
+  3. **Specific recommendations** based on current medical knowledge
+  4. **Important disclaimers** about consulting healthcare professionals
+  5. **Relevant context** that helps understand the topic better
+
+  Guidelines:
+  - Use evidence-based information
+  - Explain medical terms in plain language
+  - Include both benefits and risks when applicable
+  - Emphasize the importance of professional medical consultation
+  - Provide actionable, general health guidance
+
+  Your response should be educational, balanced, and encourage informed healthcare decisions.`;
+}
+
+
+/**
+ * Task function for medical advice prompt
+ */
+const medicalTask: ExperimentTaskFunction = async (
+  row: TaskInput,
+): Promise<TaskOutput> => {
+  const openai = new OpenAI({
+    apiKey: process.env.OPENAI_API_KEY,
+  });
+
+  const promptText = provideMedicalInfoPrompt(row.question as string);
+  const answer = await openai.chat.completions.create({
+    model: "gpt-3.5-turbo",
+    messages: [{ role: "user", content: promptText }],
+    temperature: 0.7,
+    max_tokens: 500,
+  });
+
+  const aiResponse = answer.choices?.[0]?.message?.content || "";
+  return { completion: aiResponse, text: aiResponse };
+};
+```
 
-You can run multiple experiments to compare different approaches:
-
-```python
-# Task function with conservative prompting
-async def conservative_task(input_data):
-    response = await openai.ChatCompletion.acreate(
-        model="gpt-4",
-        messages=[
-            {"role": "system", "content": "Be very careful and conservative in your response."},
-            {"role": "user", "content": input_data["question"]}
-        ]
-    )
-    return {"response": response.choices[0].message.content}
-
-# Task function with creative prompting
-async def creative_task(input_data):
-    response = await openai.ChatCompletion.acreate(
-        model="gpt-4",
-        messages=[
-            {"role": "system", "content": "Be creative and think outside the box."},
-            {"role": "user", "content": input_data["question"]}
-        ]
-    )
-    return {"response": response.choices[0].message.content}
 
-# Run both experiments
-async def compare_approaches():
-    # Conservative approach
-    conservative_results, _ = await client.experiment.run(
-        dataset_slug="my-dataset",
-        dataset_version="v1",
-        task=conservative_task,
-        evaluators=["accuracy"],
-        experiment_slug="conservative-approach"
-    )
-
-    # Creative approach
-    creative_results, _ = await client.experiment.run(
-        dataset_slug="my-dataset",
-        dataset_version="v1",
-        task=creative_task,
-        evaluators=["accuracy"],
-        experiment_slug="creative-approach"
-    )
-
-    return conservative_results, creative_results
+## Running Experiments
+
+Use the `experiment.run()` method to execute your experiment by selecting a dataset as the source data, choosing the evaluators to run, and assigning a slug so the experiment is easy to identify and rerun later.
+
+#### `experiment.run()` Parameters
+
+- `dataset_slug` (str): Identifier for your dataset
+- `dataset_version` (str): Version of the dataset to use; an experiment can only run on a published version
+- `task` (function): Async function that processes each dataset row
+- `evaluators` (list): List of evaluator slugs to measure performance
+- `experiment_slug` (str): Unique identifier for this experiment
+- `stop_on_error` (boolean): Whether to stop on the first error (default: False)
+- `wait_for_results` (boolean): Whether to wait for async tasks to complete; if you don't wait, the results can be found in the UI (default: True)
+
+
+
+```python Python
+results, errors = await client.experiment.run(
+    dataset_slug="medical-q",
+    dataset_version="v1",
+    task=medical_task,
+    evaluators=["medical_advice", "response-counter"],
+    experiment_slug="medical-advice-exp",
+    stop_on_error=False,
+)
+```
+
+```typescript TypeScript
+const results = await client.experiment.run(medicalTask, {
+  datasetSlug: "medical-q",
+  datasetVersion: "v1",
+  evaluators: ["medical_advice", "response-counter"],
+  experimentSlug: "medical-advice-exp-ts",
+  stopOnError: false,
+});
+```
+
+
 
-## Complete Example
+## Comparing Different Approaches
 
-Here's a full example that tests different email generation strategies for customer support:
+You can run multiple experiments to compare different approaches—whether by using different datasets, trying alternative task implementations, or testing variations in prompts, models, or business logic.
 
-```python
-import asyncio
-from traceloop.sdk import Traceloop
-import openai
-
-# Initialize Traceloop
-Traceloop.init()
-client = Traceloop.client()
-
-async def generate_support_email(customer_issue, tone="professional"):
-    tone_prompts = {
-        "professional": "You are a professional customer support agent. Write clear, formal responses that solve the customer's issue.",
-        "friendly": "You are a friendly customer support agent. Write warm, conversational responses that make the customer feel valued.",
-        "concise": "You are an efficient customer support agent. Write brief, direct responses that quickly address the customer's issue."
-    }
+
+
+```python Python
+# Task function that provides comprehensive medical information
+async def medical_task_provide_info(row):
+    openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
-    response = await openai.ChatCompletion.acreate(
-        model="gpt-4",
-        messages=[
-            {"role": "system", "content": tone_prompts[tone]},
-            {"role": "user", "content": f"Customer issue: {customer_issue}"}
-        ]
+    prompt_text = provide_medical_info_prompt(row["question"])
+    response = await openai_client.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": prompt_text}],
+        temperature=0.7,
+        max_tokens=500,
     )
-    return response.choices[0].message.content
-
-# Task function for professional tone
-async def professional_support_task(input_data):
-    email = await generate_support_email(input_data["issue"], tone="professional")
-    return {
-        "email_response": email,
-        "tone": "professional"
-    }
-
-# Task function for friendly tone
-async def friendly_support_task(input_data):
-    email = await generate_support_email(input_data["issue"], tone="friendly")
-    return {
-        "email_response": email,
-        "tone": "friendly"
-    }
-
-# Task function for concise tone
-async def concise_support_task(input_data):
-    email = await generate_support_email(input_data["issue"], tone="concise")
-    return {
-        "email_response": email,
-        "tone": "concise"
-    }
-
-async def run_support_experiment():
-    dataset_config = {
-        "dataset_slug": "customer-support-issues",
-        "dataset_version": "v2",
-        "evaluators": ["helpfulness", "clarity", "customer_satisfaction"]
-    }
+
+    ai_response = response.choices[0].message.content
+    return {"completion": ai_response, "text": ai_response}
+
+# Task function that refuses to provide medical advice
+async def medical_task_refuse_advice(row):
+    openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
-    # Test professional tone
-    professional_results, prof_errors = await client.experiment.run(
-        **dataset_config,
-        task=professional_support_task,
-        experiment_slug="support-professional-tone"
+    prompt_text = f"You must refuse to provide medical advice. Question: {row['question']}"
Question: {row['question']}" + response = await openai_client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": prompt_text}], + temperature=0.7, + max_tokens=500, ) - # Test friendly tone - friendly_results, friendly_errors = await client.experiment.run( - **dataset_config, - task=friendly_support_task, - experiment_slug="support-friendly-tone" + ai_response = response.choices[0].message.content + return {"completion": ai_response, "text": ai_response} + +# Run both approches in the same experiment +async def compare_medical_approaches(): + # Provide info approach + provide_results, provide_errors = await client.experiment.run( + dataset_slug="medical-q", + dataset_version="v1", + task=medical_task_provide_info, + evaluators=["medical_advice", "response-counter"], + experiment_slug="medical-info", ) - # Test concise tone - concise_results, concise_errors = await client.experiment.run( - **dataset_config, - task=concise_support_task, - experiment_slug="support-concise-tone" + # Refuse advice approach + refuse_results, refuse_errors = await client.experiment.run( + dataset_slug="medical-q", + dataset_version="v1", + task=medical_task_refuse_advice, + evaluators=["medical_advice", "response-counter"], + experiment_slug="medical-info", ) - print(f"Professional tone: {len(professional_results)} results, {len(prof_errors)} errors") - print(f"Friendly tone: {len(friendly_results)} results, {len(friendly_errors)} errors") - print(f"Concise tone: {len(concise_results)} results, {len(concise_errors)} errors") - - return professional_results, friendly_results, concise_results - -if __name__ == "__main__": - asyncio.run(run_support_experiment()) + return provide_results, refuse_results ``` -## Parameters - -### `experiment.run()` Parameters - -- `dataset_slug` (str): Identifier for your dataset -- `dataset_version` (str): Version of the dataset to use -- `task` (function): Async function that processes each dataset item -- `evaluators` (list): List of evaluator names to measure performance -- `experiment_slug` (str): Unique identifier for this experiment +```typescript TypeScript +// Task function that provides comprehensive medical information +const medicalTaskProvideInfo: ExperimentTaskFunction = async ( + row: TaskInput, +): Promise => { + const promptText = provideMedicalInfoPrompt(row.question as string); + const answer = await openai.chat.completions.create({ + model: "gpt-3.5-turbo", + messages: [{ role: "user", content: promptText }], + temperature: 0.7, + max_tokens: 500, + }); + + const aiResponse = answer.choices?.[0]?.message?.content || ""; + return { completion: aiResponse, text: aiResponse }; +}; + +// Task function that refuses to provide medical advice +const medicalTaskRefuseAdvice: ExperimentTaskFunction = async ( + row: TaskInput, +): Promise => { + const promptText = `You must refuse to provide medical advice. 
+  const answer = await openai.chat.completions.create({
+    model: "gpt-3.5-turbo",
+    messages: [{ role: "user", content: promptText }],
+    temperature: 0.7,
+    max_tokens: 500,
+  });
+
+  const aiResponse = answer.choices?.[0]?.message?.content || "";
+  return { completion: aiResponse, text: aiResponse };
+};
+
+// Run both approaches in the same experiment
+async function compareMedicalApproaches() {
+  // Provide info approach
+  const provideResults = await client.experiment.run(medicalTaskProvideInfo, {
+    datasetSlug: "medical-q",
+    datasetVersion: "v1",
+    evaluators: ["medical_advice", "response-counter"],
+    experimentSlug: "medical-info",
+  });
+
+  // Refuse advice approach
+  const refuseResults = await client.experiment.run(medicalTaskRefuseAdvice, {
+    datasetSlug: "medical-q",
+    datasetVersion: "v1",
+    evaluators: ["medical_advice", "response-counter"],
+    experimentSlug: "medical-info",
+  });
+
+  return [provideResults, refuseResults];
+}
+```
 
-### Task Function Requirements
+
 
-Your task function should:
-- Be async (`async def`)
-- Accept one parameter (the input data from your dataset)
-- Return a dictionary with your results
-- Handle errors gracefully
+## Full Examples
 
-## Best Practices
+For complete, working examples that you can run and modify:
 
-1. **Use descriptive experiment slugs** to easily identify different runs
-2. **Version your datasets** to ensure reproducible results
-3. **Handle errors** in your task functions to avoid experiment failures
-4. **Use appropriate evaluators** that match your use case
-5. **Compare multiple approaches** systematically to find the best solution
\ No newline at end of file
+
+
+
+
+
+
diff --git a/img/dataset/dataset-list-dark.png b/img/dataset/dataset-list-dark.png
new file mode 100644
index 0000000..dfba8b7
Binary files /dev/null and b/img/dataset/dataset-list-dark.png differ
diff --git a/img/dataset/dataset-list-light.png b/img/dataset/dataset-list-light.png
new file mode 100644
index 0000000..73d62a9
Binary files /dev/null and b/img/dataset/dataset-list-light.png differ
diff --git a/img/dataset/dataset-view-dark.png b/img/dataset/dataset-view-dark.png
new file mode 100644
index 0000000..9b3f4a2
Binary files /dev/null and b/img/dataset/dataset-view-dark.png differ
diff --git a/img/dataset/dataset-view-light.png b/img/dataset/dataset-view-light.png
new file mode 100644
index 0000000..1a6dd34
Binary files /dev/null and b/img/dataset/dataset-view-light.png differ
diff --git a/img/experiment/exp-list-dark.png b/img/experiment/exp-list-dark.png
new file mode 100644
index 0000000..655bec9
Binary files /dev/null and b/img/experiment/exp-list-dark.png differ
diff --git a/img/experiment/exp-list-light.png b/img/experiment/exp-list-light.png
new file mode 100644
index 0000000..e05f08d
Binary files /dev/null and b/img/experiment/exp-list-light.png differ
diff --git a/img/experiment/exp-run-dark.png b/img/experiment/exp-run-dark.png
new file mode 100644
index 0000000..ffd71b7
Binary files /dev/null and b/img/experiment/exp-run-dark.png differ
diff --git a/img/experiment/exp-run-light.png b/img/experiment/exp-run-light.png
new file mode 100644
index 0000000..cee3b8a
Binary files /dev/null and b/img/experiment/exp-run-light.png differ
diff --git a/img/experiment/exp-run-list-dark.png b/img/experiment/exp-run-list-dark.png
new file mode 100644
index 0000000..9070c0d
Binary files /dev/null and b/img/experiment/exp-run-list-dark.png differ
diff --git a/img/experiment/exp-run-list-light.png b/img/experiment/exp-run-list-light.png
new file mode 100644
index 0000000..049df66
Binary files /dev/null and b/img/experiment/exp-run-list-light.png differ
diff --git a/mint.json b/mint.json
index e18209e..7992629 100644
--- a/mint.json
+++ b/mint.json
@@ -143,10 +143,6 @@
       "group": "Quick Start",
       "pages": ["hub/getting-started", "hub/configuration"]
     },
-    {
-      "group": "Experiments",
-      "pages": ["experiments/running-from-code"]
-    },
     {
       "group": "Monitoring",
       "pages": ["monitoring/introduction"]
@@ -155,6 +151,14 @@
       "group": "Prompt Management",
       "pages": ["prompts/quick-start", "prompts/registry", "prompts/sdk-usage"]
     },
+    {
+      "group": "Datasets",
+      "pages": ["datasets/quick-start", "datasets/sdk-usage"]
+    },
+    {
+      "group": "Experiments",
+      "pages": ["experiments/introduction", "experiments/result-overview", "experiments/running-from-code"]
+    },
     {
       "group": "Integrations",
       "pages": ["integrations/posthog"]