"""Clean a question/answer CSV and export it as a JSON list of records.

The script reads the CSV at ``input_csv_path``, discards rows whose
``question`` or ``answer`` is missing or blank, trims surrounding whitespace
from ``question``, ``answer`` and ``source``, removes duplicate
question/answer pairs, and writes the result to ``output_json_path`` as a
list of ``{"title", "context", "source"}`` objects.
"""

import json
import os

import pandas as pd

# Location of the raw CSV and of the cleaned JSON export.
input_csv_path = "data/medquad.csv"
output_json_path = "data/medquad_cleaned.json"


def _clean_frame(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *frame*; the input frame is not modified.

    Rows with a missing ``question`` or ``answer`` are dropped, the text
    columns are stripped of surrounding whitespace, rows left with an empty
    question or answer are dropped as well, and duplicate question/answer
    pairs are collapsed to their first occurrence.
    """
    cleaned = frame.dropna(subset=["question", "answer"]).copy()

    # Cast to str before stripping: the ``.str`` accessor yields NaN for
    # non-string cells, and NaN would be serialised as the invalid JSON
    # token ``NaN``. Missing values are already handled at this point, so
    # the cast cannot turn them into the literal text "nan".
    for column in ("question", "answer"):
        cleaned[column] = cleaned[column].astype(str).str.strip()
    cleaned["source"] = cleaned["source"].fillna("").astype(str).str.strip()

    # Whitespace-only cells are not NaN, so they survive ``dropna`` and only
    # become empty after stripping; treat them as missing too.
    has_text = (cleaned["question"] != "") & (cleaned["answer"] != "")
    cleaned = cleaned[has_text]

    # De-duplicate after stripping so pairs differing only in padding merge.
    return cleaned.drop_duplicates(subset=["question", "answer"])


def _to_records(frame: pd.DataFrame) -> list:
    """Convert *frame* to a list of ``title``/``context``/``source`` dicts."""
    return (
        frame[["question", "answer", "source"]]
        .rename(columns={"question": "title", "answer": "context"})
        .to_dict(orient="records")
    )


# Make sure the destination directory exists before doing any work.
# ``os.path.dirname`` returns "" for a bare filename, and ``os.makedirs("")``
# raises FileNotFoundError, so only create a directory when there is one.
output_dir = os.path.dirname(output_json_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

df = pd.read_csv(input_csv_path)
df = _clean_frame(df)

cleaned_data = _to_records(df)

with open(output_json_path, "w", encoding="utf-8") as f:
    # The file is UTF-8, so write non-ASCII text as-is instead of \uXXXX
    # escapes; the parsed JSON is identical either way.
    json.dump(cleaned_data, f, indent=2, ensure_ascii=False)

print(f"✅ Cleaned data saved to: {output_json_path}")