import json

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
# Read JSON, JSONL, and Parquet files
def read_json_file(file_path):
    """Load and return the contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read (opened as UTF-8).

    Returns:
        The decoded JSON value, or ``None`` when the file is missing, is not
        valid JSON, or any other error occurs. In the error cases a message
        is printed instead of an exception being raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except json.JSONDecodeError:
        print(f"File {file_path} is not a valid JSON file.")
    except Exception as e:
        print(f"An error occurred: {e}")
    # Every error path falls through to here; make the result explicit.
    return None


def read_jsonl_file(file_path):
    """Read a JSON Lines file and return the decoded records.

    Args:
        file_path: Path of the JSONL file to read (opened as UTF-8).

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            offending line is printed before the error propagates.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines carry no record; skip them rather than abort.
            if not line.strip():
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                # Show the bad line, then abort with the real parse error.
                print(line)
                raise
    return data
def read_praquet_file(file_path):
    """Read a Parquet file and return its rows.

    Args:
        file_path: Path of the Parquet file to read.

    Returns:
        A list containing one ``pandas.Series`` per row of the table,
        in DataFrame order.
    """
    table = pq.read_table(file_path)
    df = table.to_pandas()
    # iterrows() yields (index, Series) pairs; only the row Series is kept.
    return [row for _, row in df.iterrows()]


# Correctly spelled alias; the original (misspelled) name stays so existing
# callers keep working.
read_parquet_file = read_praquet_file
# Write JSON, JSONL, and Parquet files
def save_json(file_path, data):
    """Write ``data`` to ``file_path`` as pretty-printed UTF-8 JSON.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: Any JSON-serializable value.

    Non-ASCII characters are written as-is rather than escaped.
    """
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4, ensure_ascii=False)
    print(f'Save {file_path} is ok!')


def save_jsonl(file_path, data):
    """Write an iterable of records to ``file_path`` as JSON Lines.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: Iterable of JSON-serializable items, written one per line.

    Errors are reported with a printed message instead of being raised.
    """
    try:
        with open(file_path, 'w', encoding='utf-8') as file:
            for item in data:
                file.write(json.dumps(item, ensure_ascii=False) + '\n')
        print(f"Data saved to {file_path}")
    except Exception as e:
        print(f"An error occurred while saving the data: {e}")


def save_parquet(file_path, data):
    """Write tabular data to ``file_path`` as a Parquet file.

    Args:
        file_path: Destination path of the Parquet file.
        data: A ``pandas.DataFrame``, or a list that ``pandas.DataFrame``
            can be constructed from.

    Raises:
        ValueError: If ``data`` is neither a list nor a DataFrame.
    """
    if isinstance(data, list):
        data = pd.DataFrame(data)
    if not isinstance(data, pd.DataFrame):
        raise ValueError("data must be a pandas DataFrame or a list of lists")
    # `pa` is the top-level pyarrow package imported at the top of the file.
    pq.write_table(pa.Table.from_pandas(data), file_path)
    print(f'Save {file_path} is ok!')


# Example data for writing a Parquet file
# NOTE(review): these statements run whenever the module is imported;
# consider moving them under an `if __name__ == "__main__":` guard.
data = dict(col1=[1, 2, 3], col2=['a', 'b', 'c'])
df = pd.DataFrame(data)

# Save the data to a Parquet file
save_parquet('output.parquet', df)