Spaces:

Mahmous
/

chatbot3

Running

chatbot3 / data /clean_medquad.py

Mahmoud Sayed

First

7dc8b43 about 2 months ago

930 Bytes


	import pandas as pd
	import json
	import os

	# Input and output paths
	input_csv_path = "data/medquad.csv"
	output_json_path = "data/medquad_cleaned.json"

	# Make sure output directory exists
	os.makedirs(os.path.dirname(output_json_path), exist_ok=True)

	# Load CSV
	df = pd.read_csv(input_csv_path)

	# Basic cleaning
	df.dropna(subset=["question", "answer"], inplace=True)
	df["question"] = df["question"].str.strip()
	df["answer"] = df["answer"].str.strip()
	df["source"] = df["source"].fillna("").str.strip()
	df.drop_duplicates(subset=["question", "answer"], inplace=True)

	# Convert to list of dicts
	cleaned_data = [
	{
	"title": row["question"],
	"context": row["answer"],
	"source": row["source"]
	}
	for _, row in df.iterrows()
	]

	# Save as JSON
	with open(output_json_path, "w", encoding="utf-8") as f:
	json.dump(cleaned_data, f, indent=2)

	print(f"✅ Cleaned data saved to: {output_json_path}")