File size: 930 Bytes
7dc8b43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38

import pandas as pd
import json
import os

# Input and output paths
input_csv_path = "data/medquad.csv"
output_json_path = "data/medquad_cleaned.json"


def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *df*; the input frame is not modified.

    Rows with a missing ``question`` or ``answer`` are dropped, the
    ``question``, ``answer`` and ``source`` columns are whitespace-stripped
    (missing ``source`` becomes ``""``), rows whose question or answer is
    blank after stripping are dropped, and duplicate question/answer pairs
    are collapsed to their first occurrence.

    Raises:
        KeyError: if ``question``, ``answer`` or ``source`` is not a column.
    """
    # .copy() so the column assignments below act on an independent frame.
    cleaned = df.dropna(subset=["question", "answer"]).copy()

    # Coerce to str first: .str.strip() yields NaN for non-string cells and
    # fails outright on a purely numeric column.
    for column in ("question", "answer"):
        cleaned[column] = cleaned[column].astype(str).str.strip()
    cleaned["source"] = cleaned["source"].fillna("").astype(str).str.strip()

    # Whitespace-only cells are not NaN, so dropna() lets them through; once
    # stripped they are empty and carry no content worth exporting.
    has_text = (cleaned["question"] != "") & (cleaned["answer"] != "")
    cleaned = cleaned[has_text]

    # De-duplicate after stripping so pairs differing only by padding merge.
    return cleaned.drop_duplicates(subset=["question", "answer"])


def main(input_path=input_csv_path, output_path=output_json_path) -> list:
    """Convert the CSV at *input_path* into a cleaned JSON file.

    Args:
        input_path: CSV file with ``question``, ``answer`` and ``source``
            columns.
        output_path: Destination JSON file; its parent directory is created
            when missing.

    Returns:
        The list of ``{"title", "context", "source"}`` dicts that was written.
    """
    # os.path.dirname() is "" for a bare filename and makedirs("") raises.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    cleaned = clean_dataframe(pd.read_csv(input_path))

    # Column order here fixes the key order of every emitted record.
    cleaned_data = (
        cleaned[["question", "answer", "source"]]
        .rename(columns={"question": "title", "answer": "context"})
        .to_dict(orient="records")
    )

    with open(output_path, "w", encoding="utf-8") as f:
        # The file is UTF-8, so write non-ASCII text as-is instead of as
        # \uXXXX escapes; the parsed JSON content is the same either way.
        json.dump(cleaned_data, f, indent=2, ensure_ascii=False)

    print(f"✅ Cleaned data saved to: {output_path}")
    return cleaned_data


if __name__ == "__main__":
    main()