Skip to content

Commit d1769c1

Browse files
authored
Fix KeyError occurring using fine_tunes.prepare_data (openai#125)
* Initial commit * Add fix * Reinstate reset_index() * Add suggestions * Remove print stmt * punctuation * Add test for fine_tunes.prepare_data * Renamed file, added docstrings * Move comment placement
1 parent 34a1209 commit d1769c1

File tree

2 files changed

+60
-5
lines changed

2 files changed

+60
-5
lines changed
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import json
2+
import subprocess
3+
from tempfile import NamedTemporaryFile
4+
5+
6+
def test_long_examples_validator() -> None:
    """
    Ensure that long_examples_validator() handles previously applied
    recommendations, namely dropped duplicates, without resulting in a
    KeyError.

    Runs the real ``openai tools fine_tunes.prepare_data`` CLI against a
    temporary JSONL training file and answers "y" to every interactive
    recommendation prompt, so the duplicate-removal step is applied before
    the long-example step and shifts the long rows' indices.
    """
    # Data: two identical "long" rows sandwich a short one, so dropping the
    # duplicate changes which indices hold long examples.
    short_prompt = "a prompt "
    long_prompt = short_prompt * 500

    short_completion = "a completion "
    long_completion = short_completion * 500

    # The order of these matters: the duplicates must surround the short row.
    unprepared_training_data = [
        {"prompt": long_prompt, "completion": long_completion},  # 1 of 2 duplicates
        {"prompt": short_prompt, "completion": short_completion},
        {"prompt": long_prompt, "completion": long_completion},  # 2 of 2 duplicates
    ]

    # suffix includes the leading dot so the temp file is named "*.jsonl",
    # which is what the CLI's format detection keys on (the original used
    # suffix="jsonl", producing a name with no ".jsonl" extension).
    with NamedTemporaryFile(suffix=".jsonl", mode="w") as training_data:
        for prompt_completion_row in unprepared_training_data:
            training_data.write(json.dumps(prompt_completion_row) + "\n")
        training_data.flush()

        # Invoke the CLI as an argv list without a shell: no quoting issues
        # with the temp-file path, and no shell process in between (the
        # original passed a single f-string command with shell=True).
        prepared_data_cmd_output = subprocess.run(
            ["openai", "tools", "fine_tunes.prepare_data", "-f", training_data.name],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding="utf-8",
            input="y\ny\ny\ny\ny",  # apply all recommendations, one at a time
        )

    # validate data was prepared successfully
    assert prepared_data_cmd_output.stderr == ""
    # validate get_long_indexes() applied during optional_fn() call in long_examples_validator()
    assert "indices of the long examples has changed" in prepared_data_cmd_output.stdout
    # No return: the function is annotated -> None, and pytest warns on test
    # functions that return a non-None value (the original returned stdout).

openai/validators.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -158,17 +158,24 @@ def long_examples_validator(df):
158158

159159
ft_type = infer_task_type(df)
160160
if ft_type != "open-ended generation":
161-
long_examples = df.apply(
162-
lambda x: len(x.prompt) + len(x.completion) > 10000, axis=1
163-
)
164-
long_indexes = df.reset_index().index[long_examples].tolist()
161+
def get_long_indexes(d):
162+
long_examples = d.apply(
163+
lambda x: len(x.prompt) + len(x.completion) > 10000, axis=1
164+
)
165+
return d.reset_index().index[long_examples].tolist()
166+
167+
long_indexes = get_long_indexes(df)
165168

166169
if len(long_indexes) > 0:
167170
immediate_msg = f"\n- There are {len(long_indexes)} examples that are very long. These are rows: {long_indexes}\nFor conditional generation, and for classification the examples shouldn't be longer than 2048 tokens."
168171
optional_msg = f"Remove {len(long_indexes)} long examples"
169172

170173
def optional_fn(x):
171-
return x.drop(long_indexes)
174+
175+
long_indexes_to_drop = get_long_indexes(x)
176+
if long_indexes != long_indexes_to_drop:
177+
sys.stdout.write(f"The indices of the long examples has changed as a result of a previously applied recommendation.\nThe {len(long_indexes_to_drop)} long examples to be dropped are now at the following indices: {long_indexes_to_drop}\n")
178+
return x.drop(long_indexes_to_drop)
172179

173180
return Remediation(
174181
name="long_examples",

0 commit comments

Comments
 (0)