去 hf-mirror.org 下载 SFT 格式的数据(.json 文件)
转换方法:
import json
class sft_data():
    """Convert a JSON file of SFT records into MLX ``{"text": ...}`` JSONL.

    Instantiating the class performs the whole conversion: ``file_name`` is
    parsed as JSON, each record is rendered by :meth:`mlx_train_text`, and the
    results are written to ``save_name`` as one JSON object per line.

    Attributes:
        data: The parsed content of ``file_name``.
    """

    def __init__(self, file_name, save_name):
        """Read ``file_name`` and write the converted records to ``save_name``.

        Args:
            file_name: Path of the UTF-8 encoded JSON input file. Its parsed
                content is iterated, and every item is passed to
                :meth:`mlx_train_text`.
            save_name: Path of the JSONL output file; opened in ``"w"`` mode,
                so an existing file is overwritten.
        """
        # Context managers guarantee both handles are closed even if parsing
        # or rendering a record raises part-way through.
        with open(file_name, "r", encoding="utf-8") as source:
            self.data = json.load(source)
        with open(save_name, "w", encoding="utf-8") as target:
            for record in self.data:
                rendered = self.mlx_train_text(record)
                # ensure_ascii=False keeps non-ASCII text as-is instead of
                # escaping it to \uXXXX sequences.
                target.write(json.dumps(rendered, ensure_ascii=False) + "\n")

    def mlx_train_text(self, one_dic):
        """Render one record as a single training prompt.

        The first three values of ``one_dic`` (in insertion order) are used as
        question, reasoning and final answer. Fields are taken by position, not
        by key name, so the key names themselves are never inspected.

        Args:
            one_dic: Mapping whose first three values are strings.

        Returns:
            A dict with the single key ``"text"`` holding the prompt.

        Raises:
            IndexError: If ``one_dic`` has fewer than three values.
            TypeError: If one of the three values is not a string.
        """
        leading = list(one_dic.values())[0:3]
        question = leading[0]
        reasoning = leading[1]
        answer = leading[2]
        text = (
            "Please reason step by step:\n\nQuestion:" + question
            + "\n\nLet's solve this step by step:\n" + reasoning
            + "\n\nFinal Answer:" + answer
        )
        return {"text": text}
# Run the conversion: read the downloaded JSON dataset and write the
# MLX-style JSONL file next to it.
sft_data(
    file_name="medical_o1_sft_Chinese.json",
    save_name="mlx_sft2.jsonl",
)
import json
class sft_data():
    """Convert a JSON file of SFT records into MLX ``{"text": ...}`` JSONL.

    Instantiating the class performs the whole conversion: ``file_name`` is
    parsed as JSON, each record is rendered by :meth:`mlx_train_text`, and the
    results are written to ``save_name`` as one JSON object per line.

    Attributes:
        data: The parsed content of ``file_name``.
    """

    def __init__(self, file_name, save_name):
        """Read ``file_name`` and write the converted records to ``save_name``.

        Args:
            file_name: Path of the UTF-8 encoded JSON input file. Its parsed
                content is iterated, and every item is passed to
                :meth:`mlx_train_text`.
            save_name: Path of the JSONL output file; opened in ``"w"`` mode,
                so an existing file is overwritten.
        """
        # Context managers guarantee both handles are closed even if parsing
        # or rendering a record raises part-way through.
        with open(file_name, "r", encoding="utf-8") as source:
            self.data = json.load(source)
        # json.dumps is used with its default ASCII escaping, so non-ASCII
        # characters are written as \uXXXX; the explicit encoding only removes
        # the dependency on the platform default.
        with open(save_name, "w", encoding="utf-8") as target:
            for record in self.data:
                target.write(json.dumps(self.mlx_train_text(record)) + "\n")

    def mlx_train_text(self, one_dic):
        """Render one record as a single training prompt.

        Args:
            one_dic: Mapping with string values under the keys ``'Question'``,
                ``'Complex_CoT'`` and ``'Response'``.

        Returns:
            A dict with the single key ``"text"`` holding the prompt.

        Raises:
            KeyError: If one of the three keys is missing.
            TypeError: If one of the three values is not a string.
        """
        question = one_dic['Question']
        answer = one_dic['Response']
        reasoning = one_dic['Complex_CoT']
        text = (
            "Please reason step by step:\n\nQuestion:" + question
            + "\n\nLet's solve this step by step:\n" + reasoning
            + "\n\nFinal Answer:" + answer
        )
        return {"text": text}
# Run the conversion: read the downloaded JSON dataset and write the
# MLX-style JSONL file next to it.
sft_data(
    file_name="medical_o1_sft_Chinese.json",
    save_name="mlx_sft.jsonl",
)
MLX 的SFT数据格式
Please reason step by step:
#空行
Question:Question
#空行
Let's solve this step by step:
Complex_CoT
#空行
Final Answer:Response
以 JSONL 的格式
{"text":上面的sft信息}
{"text":上面的sft信息}
{"text":上面的sft信息}
{"text":上面的sft信息}