123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440 |
- import pandas as pd
- import pyreadstat
- import numpy as np
- from savReaderWriter import SavReader
- def sav2csv(sav_file_path, csv_file_path):
- #读取sav数据
- with SavReader(sav_file_path) as reader:
- header = reader.header
- data = reader.all()
- #转化为Dataframe
- df = pd.DataFrame(data, columns=header)
- # 将列名从字节串转换为字符串
- df.columns = [col.decode('utf-8') if isinstance(col, bytes) else col for col in df.columns]
- # 另存为csv数据
- df.to_csv(csv_file_path, index=False)
- #进行adl转换
- def trans_adl(columns_adl, data, result, adl_name):
- # 定义转换规则
- transformation = {1: 0, 2: 1, 3: 2}
- for column in columns_adl:
- data[column] = data[column].map(transformation).fillna(np.nan) # 将其他转换为缺失值
- # 计算转换后列的总和并创建一个新列
- data[adl_name] = data[columns_adl].sum(axis=1)
- result[adl_name] = data[adl_name].apply(lambda x : 0 if x==0 else (1 if x>0 and x<=12 else np.nan))
- #进行mmse转换
- def trans_mmse(columns_mmse, data):
- # 定义转换规则
- transformation = {0: 0, 1: 1}
- for column in columns_mmse:
- data[column] = data[column].map(transformation).fillna(np.nan) # 将其他转换为缺失值
- #进行mmsec16转换
- def trans_mmse_c16(columns_mmse, data):
- # 定义转换规则
- for column in columns_mmse:
- data[column] = data[column].apply(lambda x : np.nan if x==88 or x==99 or x==-9 or x==-8 or x==-7 or x==-6 else 1) # 将其他转换为缺失值
- def get_mmse(columns_cognitive_98,columns_reaction_98,columns_attention_98, columns_memory_98 ,columns_language_98, data, result, cognitive_name):
- # 计算一般能力
- result["general_cognitive_"+cognitive_name] = data[columns_cognitive_98].sum(axis=1) *2
- # 计算反应能力
- result["reaction_"+cognitive_name] = data[columns_reaction_98].sum(axis=1)
- # 计算注意力与计算力
- result["attention_calculation_"+cognitive_name] = data[columns_attention_98].sum(axis=1)
- # 计算回忆力
- result["memory_"+cognitive_name] = data[columns_memory_98].sum(axis=1)
- # 计算语言能力和自我协调
- result["language_selfcoordination_"+cognitive_name] = data[columns_language_98].sum(axis=1)
- # 计算总合
- result['mmse_'+cognitive_name] = result["general_cognitive_"+cognitive_name] + result["reaction_"+cognitive_name]+ result["attention_calculation_"+cognitive_name]+ result["memory_"+cognitive_name]+ result["language_selfcoordination_"+cognitive_name]
- def deal_1998_2018_data():
- sav_file_path = "CLHLS/clhls_1998_2018_longitudinal_dataset_released_version1.sav"
- csv_file_path = "CLHLS/clhls_1998_2018_longitudinal_dataset_released_version1.csv"
- # 将sav数据转为csv
- # sav2csv(sav_file_path, csv_file_path)
- #处理数据
- data = pd.read_csv(csv_file_path)
- # 存活状态0存活;1死亡;-9失访;-8死亡/失访
- result = data[['id', 'dth98_00','dth00_02', 'dth02_05', 'dth02_05', 'dth05_08', 'dth08_11', 'dth11_14', 'dth14_18']]
- # 人口特征学变量
- # 8/9代表无法回答和缺失
- # 年龄
- result[['trueage_98','trueage_00','trueage_02','trueage_05','trueage_08','trueage_11', 'trueage_14', 'trueage_18']] = data[['trueage','vage_0','vage_2', 'vage_5', 'vage_8','vage_11', 'trueage_14', 'trueage_18']]
- # 性别 1男;0女
- result['sex'] = data['a1'].apply(lambda x : 1 if x==1 else 0)
- # 民族 1汉族;0非汉族
- result['ethnic'] = data['a2'].apply(lambda x : 1 if x==1 else 0)
- # 出生地 1城市;0农村
- result['birth_place'] = data['a42'].apply(lambda x : 1 if x == 1 else (0 if x == 2 else np.nan))
- # 教育状况 无11年
- result['edu_98'] = data['f1'].apply(lambda x : np.nan if x==88 or x==99 else x)
- result['edu_08'] = data['f1_8'].apply(lambda x : np.nan if x==88 or x==99 else x)
- result['edu_14'] = data['f1_14'].apply(lambda x : np.nan if x==88 or x==99 else x)
- result['edu_18'] = data['f1_18'].apply(lambda x : np.nan if x==88 or x==99 else x)
- # 婚姻状况 0separated/divorced/widowed/never married; 1currently married and living with spouse
- result['marital_98'] = data['f41'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 else 1))
- result['marital_00'] = data['f41_0'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-6 else 1))
- result['marital_02'] = data['f41_2'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-6 or x==-8 else 1))
- result['marital_05'] = data['f41_5'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-6 or x==-8 else 1))
- result['marital_08'] = data['f41_8'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-6 or x==-8 or x==-7 else 1))
- result['marital_08'] = data['f41_8'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-8 or x==-7 else 1))
- result['marital_11'] = data['f41_11'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-8 or x==-7 else 1))
- result['marital_14'] = data['f41_14'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 else 1))
- result['marital_18'] = data['f41_18'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 else 1))
- # 生活是否富裕 1富裕及以上;0一般及以下
- result['econ_state_00'] = data['f34_0'].apply(lambda x : 0 if x==2 or x==3 else (1 if x==1 else np.nan))
- result['econ_state_02'] = data['f34_2'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
- result['econ_state_05'] = data['f34_5'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
- result['econ_state_08'] = data['f34_8'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
- result['econ_state_11'] = data['f34_11'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
- result['econ_state_14'] = data['f34_14'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
- result['econ_state_18'] = data['f34_18'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
- # 上一年家庭收入 99998超过10万
- result['income_02'] = data['f35_2'].apply(lambda x : x if x== 99998 else np.nan)
- result['income_05'] = data['f35_5'].apply(lambda x : x if x== 99998 else np.nan)
- result['income_08'] = data['f35_8'].apply(lambda x : x if x== 99998 else np.nan)
- result['income_11'] = data['f35_11'].apply(lambda x : x if x== 99998 else np.nan)
- result['income_14'] = data['f35_14'].apply(lambda x : x if x== 99998 else np.nan)
- result['income_18'] = data['f35_18'].apply(lambda x : x if x== 99998 else np.nan)
- # 居住状态 1与家庭成员同住;2独居;3在机构居住
- result['co_residence_98'] = data['a51'].apply(lambda x : np.nan if x==9 else x)
- result['co_residence_00'] = data['a51_0'].apply(lambda x : np.nan if x==9 or x==-9 or x == -6 else x)
- result['co_residence_02'] = data['a51_2'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
- result['co_residence_05'] = data['a51_5'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
- result['co_residence_08'] = data['a51_8'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
- result['co_residence_11'] = data['a51_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
- result['co_residence_14'] = data['a51_14'].apply(lambda x : np.nan if x==9 else x)
- result['co_residence_18'] = data['a51_18'].apply(lambda x : np.nan if x==9 else x)
- # 目前是否吸烟 1是;2否
- result['smoke_98'] = data['d71'].apply(lambda x : np.nan if x==9 else x)
- result['smoke_00'] = data['d71_0'].apply(lambda x : np.nan if x==9 or x==-9 or x == -6 else x)
- result['smoke_02'] = data['d71_2'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
- result['smoke_05'] = data['d71_5'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
- result['smoke_08'] = data['d71_8'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
- result['smoke_11'] = data['d71_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
- result['smoke_14'] = data['d71_14'].apply(lambda x : np.nan if x==9 else x)
- result['smoke_18'] = data['d71_18'].apply(lambda x : np.nan if x==9 else x)
- # 目前是否饮酒 1是;2否
- result['drink_98'] = data['d81'].apply(lambda x : np.nan if x==9 else x)
- result['drink_00'] = data['d81_0'].apply(lambda x : np.nan if x==9 or x==-9 or x == -6 else x)
- result['drink_02'] = data['d81_2'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
- result['drink_05'] = data['d81_5'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
- result['drink_08'] = data['d81_8'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
- result['drink_11'] = data['d81_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
- result['drink_14'] = data['d81_14'].apply(lambda x : np.nan if x==9 else x)
- result['drink_18'] = data['d81_18'].apply(lambda x : np.nan if x==9 or x == 8 else x)
- # 目前是否锻炼
- result['exercise_98'] = data['d91'].apply(lambda x : np.nan if x==9 else x)
- result['exercise_00'] = data['d91_0'].apply(lambda x : np.nan if x==9 or x==-9 or x == -6 else x)
- result['exercise_02'] = data['d91_2'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
- result['exercise_05'] = data['d91_5'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
- result['exercise_08'] = data['d91_8'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 or x == 8 else x)
- result['exercise_11'] = data['d91_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 or x == 8 else x)
- result['exercise_14'] = data['d91_14'].apply(lambda x : np.nan if x==9 else x)
- result['exercise_18'] = data['d91_18'].apply(lambda x : np.nan if x==9 or x == 8 else x)
- # 健康状况变量 1very good; 2good; 3so so; 4bad; 5very bad;
- result['self_reported_helth_98'] = data['b12'].apply(lambda x : np.nan if x==9 or x==8 else x)
- result['self_reported_helth_00'] = data['b12_0'].apply(lambda x : np.nan if x==8 or x==-9 or x == -6 else x)
- result['self_reported_helth_02'] = data['b12_2'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 or x == 8 else x)
- result['self_reported_helth_05'] = data['b12_5'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 or x == 8 else x)
- result['self_reported_helth_08'] = data['b12_8'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 or x == 8 else x)
- result['self_reported_helth_11'] = data['b12_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 or x == 8 else x)
- result['self_reported_helth_14'] = data['b12_14'].apply(lambda x : np.nan if x==9 or x == 8 else x)
- result['self_reported_helth_18'] = data['b12_18'].apply(lambda x : np.nan if x==9 or x == 8 else x)
- # 慢性病
- result['chronic_00'] = data['g14a1_0'].apply(lambda x : np.nan if x==66 or x==89 or x==99 or x==-9 or x == -6 or x == -1 else x)
- result['chronic_02'] = data['g14a1_2'].apply(lambda x : np.nan if x==-9 or x == -8 or x == -6 or x == -1 else x)
- result['chronic_05'] = data['g14a1_5'].apply(lambda x : np.nan if x==888 or x==999 or x==-9 or x == -8 or x == -6 or x == -1 else x)
- result['chronic_08'] = data['g14a1_8'].apply(lambda x : np.nan if x==66 or x==88 or x == 99 or x==-9 or x == -8 or x == -7 or x == -1 else x)
- result['chronic_11'] = data['g14a1_11'].apply(lambda x : np.nan if x==66 or x==88 or x==99 or x==-9 or x == -8 or x == -7 or x==-1 else x)
- result['chronic_14'] = data['g14a1_14'].apply(lambda x : np.nan if x==66 or x==99 or x==88 or x == -1 else x)
- result['chronic_18'] = data['g14a1_18'].apply(lambda x : np.nan if x==66 or x==99 or x==88 or x == -1 else x)
-
- # 抑郁量表得分-only 18年 0无抑郁症;1有抑郁症
- # 简版流调中心抑郁量表(CESD-10)10个CESD项目,每个项目的分值范围为0到3分,将每个CESD项目的分值相加,得到总得分
- # 定义转换规则
- transformation_one = {1: 3, 2: 2, 3: 2, 4: 1, 5: 0}
- # 应用转换规则
- columns_cesd_one = ['b31_18', 'b32_18', 'b33_18', 'b34_18', 'b36_18', 'b38_18', 'b39_18']
- for column_one in columns_cesd_one:
- data[column_one] = data[column_one].map(transformation_one).fillna(np.nan) # 将8转换为缺失值
- # 定义转换规则
- transformation_two = {1: 0, 2: 1, 3: 1, 4: 2, 5: 3}
- # 应用转换规则
- columns_cesd_two = ['b35_18', 'b37_18', 'b310a_18']
- for column_two in columns_cesd_two:
- data[column_two] = data[column_two].map(transformation_two).fillna(np.nan) # 将8转换为缺失值
- result['cesd'] = data['b31_18'] + data['b32_18'] + data['b33_18'] + data['b34_18'] + data['b36_18'] + data['b38_18'] + data['b39_18'] + data['b35_18'] + data['b37_18'] + data['b310a_18']
- result['cesd_d'] = result['cesd'].apply(lambda x : 0 if x >= 0 and x <= 15 else (1 if x >=16 and x <= 30 else np.nan))
-
- # 日常生活活动能力 0无残疾;1有残疾
- # ADL6个项目bathing, dressing, eating, indoor transferring, toileting, and continence, 每个项目的分值范围是0到2分, 将每个ADL项目的得分相加,得到总得分
- columns_adl_00 = ['e1_0', 'e2_0', 'e3_0', 'e4_0', 'e5_0', 'e6_0']
- trans_adl(columns_adl_00, data, result, "adl_00")
- columns_adl_02 = ['e1_2', 'e2_2', 'e3_2', 'e4_2', 'e5_2', 'e6_2']
- trans_adl(columns_adl_02, data, result, "adl_02")
- columns_adl_05 = ['e1_5', 'e2_5', 'e3_5', 'e4_5', 'e5_5', 'e6_5']
- trans_adl(columns_adl_05, data, result, "adl_05")
- columns_adl_08 = ['e1_8', 'e2_8', 'e3_8', 'e4_8', 'e5_8', 'e6_8']
- trans_adl(columns_adl_08, data, result, "adl_08")
- columns_adl_11 = ['e1_11', 'e2_11', 'e3_11', 'e4_11', 'e5_11', 'e6_11']
- trans_adl(columns_adl_11, data, result, "adl_11")
- columns_adl_14 = ['e1_14', 'e2_14', 'e3_14', 'e4_14', 'e5_14', 'e6_14']
- trans_adl(columns_adl_14, data, result, "adl_14")
- columns_adl_18 = ['e1_18', 'e2_18', 'e3_18', 'e4_18', 'e5_18', 'e6_18']
- trans_adl(columns_adl_18, data, result, "adl_18")
- # 认知功能 0有认知功能障碍;1认知功能正常
- # 简易精神状态评价量表(Mini-mental State Examination, MMSE),该量表包括一般能力(12分),反应能力(3分),注意力与计算力(6分),回忆力(3分),语言理解
- # 与自我协调能力(6分)5个部分24个问题,总分30分,分数越高,表示认知功能水平越高
- columns_mmse = ["c11", "c12", "c13", "c14", "c15", "c21a", "c21b", "c21c", "c31a", "c31b", "c31c", "c31d", "c31e", "c32", "c41a", "c41b", "c41c", "c51a", "c51b", "c52", "c53a", "c53b", "c53c",
- "c11_0", "c12_0", "c13_0", "c14_0", "c15_0", "c21a_0", "c21b_0", "c21c_0", "c31a_0", "c31b_0", "c31c_0", "c31d_0", "c31e_0", "c32_0", "c41a_0", "c41b_0", "c41c_0", "c51a_0", "c51b_0", "c52_0", "c53a_0", "c53b_0", "c53c_0",
- "c11_2", "c12_2", "c13_2", "c14_2", "c15_2", "c21a_2", "c21b_2", "c21c_2", "c31a_2", "c31b_2", "c31c_2", "c31d_2", "c31e_2", "c32_2", "c41a_2", "c41b_2", "c41c_2", "c51a_2", "c51b_2", "c52_2", "c53a_2", "c53b_2", "c53c_2",
- "c11_5", "c12_5", "c13_5", "c14_5", "c15_5", "c21a_5", "c21b_5", "c21c_5", "c31a_5", "c31b_5", "c31c_5", "c31d_5", "c31e_5", "c32_5", "c41a_5", "c41b_5", "c41c_5", "c51a_5", "c51b_5", "c52_5", "c53a_5", "c53b_5", "c53c_5",
- "c11_8", "c12_8", "c13_8", "c14_8", "c15_8", "c21a_8", "c21b_8", "c21c_8", "c31a_8", "c31b_8", "c31c_8", "c31d_8", "c31e_8", "c32_8", "c41a_8", "c41b_8", "c41c_8", "c51a_8", "c51b_8", "c52_8", "c53a_8", "c53b_8", "c53c_8",
- "c11_11", "c12_11", "c13_11", "c14_11", "c15_11", "c21a_11", "c21b_11", "c21c_11", "c31a_11", "c31b_11", "c31c_11", "c31d_11", "c31e_11", "c32_11", "c41a_11", "c41b_11", "c41c_11", "c51a_11", "c51b_11", "c52_11", "c53a_11", "c53b_11", "c53c_11",
- "c11_14", "c12_14", "c13_14", "c14_14", "c15_14", "c21a_14", "c21b_14", "c21c_14", "c31a_14", "c31b_14", "c31c_14", "c31d_14", "c31e_14", "c32_14", "c41a_14", "c41b_14", "c41c_14", "c51a_14", "c51b_14", "c52_14", "c53a_14", "c53b_14", "c53c_14",
- "c11_18", "c12_18", "c13_18", "c14_18", "c15_18", "c21a_18", "c21b_18", "c21c_18", "c31a_18", "c31b_18", "c31c_18", "c31d_18", "c31e_18", "c32_18", "c41a_18", "c41b_18", "c41c_18", "c51a_18", "c51b_18", "c52_18", "c53a_18", "c53b_18", "c53c_18"]
- trans_mmse(columns_mmse, data)
- columns_mmse_c16 = ["c16", "c16_0", "c16_2", "c16_5", "c16_8", "c16_11", "c16_14", "c16_18"]
- trans_mmse_c16(columns_mmse_c16, data)
- columns_cognitive_98 = ["c11", "c12", "c13", "c14", "c15", "c16"]
- columns_reaction_98 = ["c21a", "c21b", "c21c"]
- columns_attention_98 = ["c31a", "c31b", "c31c", "c31d", "c31e", "c32"]
- columns_memory_98 = ["c41a", "c41b", "c41c"]
- columns_language_98 = ["c51a", "c51b", "c52", "c53a", "c53b", "c53c"]
- get_mmse(columns_cognitive_98,columns_reaction_98,columns_attention_98, columns_memory_98 ,columns_language_98, data, result, "98")
- columns_cognitive_00 = ["c11_0", "c12_0", "c13_0", "c14_0", "c15_0", "c16_0"]
- columns_reaction_00 = ["c21a_0", "c21b_0", "c21c_0"]
- columns_attention_00 = ["c31a_0", "c31b_0", "c31c_0", "c31d_0", "c31e_0", "c32_0"]
- columns_memory_00 = ["c41a_0", "c41b_0", "c41c_0"]
- columns_language_00 = ["c51a_0", "c51b_0", "c52_0", "c53a_0", "c53b_0", "c53c_0"]
- get_mmse(columns_cognitive_00,columns_reaction_00,columns_attention_00, columns_memory_00 ,columns_language_00, data, result, "00")
- columns_cognitive_02 = ["c11_2", "c12_2", "c13_2", "c14_2", "c15_2", "c16_2"]
- columns_reaction_02 = ["c21a_2", "c21b_2", "c21c_2"]
- columns_attention_02 = ["c31a_2", "c31b_2", "c31c_2", "c31d_2", "c31e_2", "c32_2"]
- columns_memory_02 = ["c41a_2", "c41b_2", "c41c_2"]
- columns_language_02 = ["c51a_2", "c51b_2", "c52_2", "c53a_2", "c53b_2", "c53c_2"]
- get_mmse(columns_cognitive_02,columns_reaction_02,columns_attention_02, columns_memory_02 ,columns_language_02, data, result, "02")
- columns_cognitive_05 = ["c11_5", "c12_5", "c13_5", "c14_5", "c15_5", "c16_5"]
- columns_reaction_05 = ["c21a_5", "c21b_5", "c21c_5"]
- columns_attention_05 = ["c31a_5", "c31b_5", "c31c_5", "c31d_5", "c31e_5", "c32_5"]
- columns_memory_05 = ["c41a_5", "c41b_5", "c41c_5"]
- columns_language_05 = ["c51a_5", "c51b_5", "c52_5", "c53a_5", "c53b_5", "c53c_5"]
- get_mmse(columns_cognitive_05,columns_reaction_05,columns_attention_05, columns_memory_05 ,columns_language_05, data, result, "05")
- columns_cognitive_08 = ["c11_8", "c12_8", "c13_8", "c14_8", "c15_8", "c16_8"]
- columns_reaction_08 = ["c21a_8", "c21b_8", "c21c_8"]
- columns_attention_08 = ["c31a_8", "c31b_8", "c31c_8", "c31d_8", "c31e_8", "c32_8"]
- columns_memory_08 = ["c41a_8", "c41b_8", "c41c_8"]
- columns_language_08 = ["c51a_8", "c51b_8", "c52_8", "c53a_8", "c53b_8", "c53c_8"]
- get_mmse(columns_cognitive_08,columns_reaction_08,columns_attention_08, columns_memory_08 ,columns_language_08, data, result, "08")
- columns_cognitive_11 = ["c11_11", "c12_11", "c13_11", "c14_11", "c15_11", "c16_11"]
- columns_reaction_11 = ["c21a_11", "c21b_11", "c21c_11"]
- columns_attention_11 = ["c31a_11", "c31b_11", "c31c_11", "c31d_11", "c31e_11", "c32_11"]
- columns_memory_11 = ["c41a_11", "c41b_11", "c41c_11"]
- columns_language_11 = ["c51a_11", "c51b_11", "c52_11", "c53a_11", "c53b_11", "c53c_11"]
- get_mmse(columns_cognitive_11,columns_reaction_11,columns_attention_11, columns_memory_11 ,columns_language_11, data, result, "11")
- columns_cognitive_14 = ["c11_14", "c12_14", "c13_14", "c14_14", "c15_14", "c16_14"]
- columns_reaction_14 = ["c21a_14", "c21b_14", "c21c_14"]
- columns_attention_14 = ["c31a_14", "c31b_14", "c31c_14", "c31d_14", "c31e_14", "c32_14"]
- columns_memory_14 = ["c41a_14", "c41b_14", "c41c_14"]
- columns_language_14 = ["c51a_14", "c51b_14", "c52_14", "c53a_14", "c53b_14", "c53c_14"]
- get_mmse(columns_cognitive_14,columns_reaction_14,columns_attention_14, columns_memory_14 ,columns_language_14, data, result, "14")
- columns_cognitive_18 = ["c11_18", "c12_18", "c13_18", "c14_18", "c15_18", "c16_18"]
- columns_reaction_18 = ["c21a_18", "c21b_18", "c21c_18"]
- columns_attention_18 = ["c31a_18", "c31b_18", "c31c_18", "c31d_18", "c31e_18", "c32_18"]
- columns_memory_18 = ["c41a_18", "c41b_18", "c41c_18"]
- columns_language_18 = ["c51a_18", "c51b_18", "c52_18", "c53a_18", "c53b_18", "c53c_18"]
- get_mmse(columns_cognitive_18,columns_reaction_18,columns_attention_18, columns_memory_18 ,columns_language_18, data, result, "18")
- # #血液指标
- # columns_to_exclude = ['midn', 'trueage', 'a1'] # 替换为你要排除的列名
- # #2008
- # biomarker_08 = pd.read_csv("CLHLS/biomarker_dataset_CLHLS_2008-1.tab", sep='\t')
- # biomarker_08 = biomarker_08.drop(columns=columns_to_exclude)
- # columns_bio = ["id"]
- # for col in biomarker_08.columns:
- # if not col == "id":
- # columns_bio.append(col+"_08")
- # biomarker_08.columns = columns_bio
- # result = pd.merge(result, biomarker_08, on = ["id"], how="left")
- print(result.head())
- result.to_csv("CLHLS/clhls_1998_2018_result.csv", index=False)
- def deal_2008_2018_data():
- sav_file_path = "CLHLS/clhls_2008_2018_longitudinal_dataset_released_version1.sav"
- csv_file_path = "CLHLS/clhls_2008_2018_longitudinal_dataset_released_version1.csv"
- # 将sav数据转为csv
- # sav2csv(sav_file_path, csv_file_path)
- #处理数据
- data = pd.read_csv(csv_file_path)
- # 存活状态0存活;1死亡;-9失访;-8死亡/失访
- result = data[['id', 'dth08_11', 'dth11_14', 'dth14_18']]
- # 人口特征学变量
- # 8/9代表无法回答和缺失
- # 年龄
- result[['trueage_08','trueage_11', 'trueage_14', 'trueage_18']] = data[['trueage','vage_11', 'trueage_14', 'trueage_18']]
- # 性别 1男;0女
- result['sex'] = data['a1'].apply(lambda x : 1 if x==1 else 0)
- # 民族 1汉族;0非汉族
- result['ethnic'] = data['a2'].apply(lambda x : 1 if x==1 else 0)
- # 出生地 1城市;0农村
- result['birth_place'] = data['a43'].apply(lambda x : 1 if x == 1 else (0 if x == 2 else np.nan))
- # 教育状况 无11年
- result['edu_08'] = data['f1'].apply(lambda x : np.nan if x==88 or x==99 else x)
- result['edu_14'] = data['f1_14'].apply(lambda x : np.nan if x==88 or x==99 else x)
- result['edu_18'] = data['f1_18'].apply(lambda x : np.nan if x==88 or x==99 else x)
- # 婚姻状况 0separated/divorced/widowed/never married; 1currently married and living with spouse
- result['marital_08'] = data['f41'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 else 1))
- result['marital_11'] = data['f41_11'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-8 or x==-7 else 1))
- result['marital_14'] = data['f41_14'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 else 1))
- result['marital_18'] = data['f41_18'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 else 1))
- # 生活是否富裕 1富裕及以上;0一般及以下
- result['econ_state_08'] = data['f34'].apply(lambda x : 0 if x==2 or x==3 else (1 if x==1 else np.nan))
- result['econ_state_11'] = data['f34_11'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
- result['econ_state_14'] = data['f34_14'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
- result['econ_state_18'] = data['f34_18'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
- # 上一年家庭收入 99998超过10万
- result['income_08'] = data['f35'].apply(lambda x : x if x== 99998 else np.nan)
- result['income_11'] = data['f35_11'].apply(lambda x : x if x== 99998 else np.nan)
- result['income_14'] = data['f35_14'].apply(lambda x : x if x== 99998 else np.nan)
- result['income_18'] = data['f35_18'].apply(lambda x : x if x== 99998 else np.nan)
- # 居住状态 1与家庭成员同住;2独居;3在机构居住
- result['co_residence_08'] = data['a51'].apply(lambda x : np.nan if x==9 else x)
- result['co_residence_11'] = data['a51_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
- result['co_residence_14'] = data['a51_14'].apply(lambda x : np.nan if x==9 else x)
- result['co_residence_18'] = data['a51_18'].apply(lambda x : np.nan if x==9 else x)
- # 目前是否吸烟 1是;2否
- result['smoke_08'] = data['d71'].apply(lambda x : np.nan if x==9 else x)
- result['smoke_11'] = data['d71_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
- result['smoke_14'] = data['d71_14'].apply(lambda x : np.nan if x==9 else x)
- result['smoke_18'] = data['d71_18'].apply(lambda x : np.nan if x==9 else x)
- # 目前是否饮酒 1是;2否
- result['drink_08'] = data['d81'].apply(lambda x : np.nan if x==9 else x)
- result['drink_11'] = data['d81_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
- result['drink_14'] = data['d81_14'].apply(lambda x : np.nan if x==9 else x)
- result['drink_18'] = data['d81_18'].apply(lambda x : np.nan if x==9 or x == 8 else x)
- # 目前是否锻炼
- result['exercise_08'] = data['d91'].apply(lambda x : np.nan if x==9 else x)
- result['exercise_11'] = data['d91_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 or x == 8 else x)
- result['exercise_14'] = data['d91_14'].apply(lambda x : np.nan if x==9 else x)
- result['exercise_18'] = data['d91_18'].apply(lambda x : np.nan if x==9 or x == 8 else x)
- # 健康状况变量 1very good; 2good; 3so so; 4bad; 5very bad;
- result['self_reported_helth_08'] = data['b12'].apply(lambda x : np.nan if x==9 or x==8 else x)
- result['self_reported_helth_11'] = data['b12_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 or x == 8 else x)
- result['self_reported_helth_14'] = data['b12_14'].apply(lambda x : np.nan if x==9 or x == 8 else x)
- result['self_reported_helth_18'] = data['b12_18'].apply(lambda x : np.nan if x==9 or x == 8 else x)
- # 慢性病
- result['chronic_08'] = data['g14a1'].apply(lambda x : np.nan if x==66 or x==89 or x==99 or x==-9 or x == -6 or x == -1 else x)
- result['chronic_11'] = data['g14a1_11'].apply(lambda x : np.nan if x==66 or x==88 or x==99 or x==-9 or x == -8 or x == -7 or x==-1 else x)
- result['chronic_14'] = data['g14a1_14'].apply(lambda x : np.nan if x==66 or x==99 or x==88 or x == -1 else x)
- result['chronic_18'] = data['g14a1_18'].apply(lambda x : np.nan if x==66 or x==99 or x==88 or x == -1 else x)
-
- # 抑郁量表得分-only 18年 0无抑郁症;1有抑郁症
- # 简版流调中心抑郁量表(CESD-10)10个CESD项目,每个项目的分值范围为0到3分,将每个CESD项目的分值相加,得到总得分
- # 定义转换规则
- transformation_one = {1: 3, 2: 2, 3: 2, 4: 1, 5: 0}
- # 应用转换规则
- columns_cesd_one = ['b31_18', 'b32_18', 'b33_18', 'b34_18', 'b36_18', 'b38_18', 'b39_18']
- for column_one in columns_cesd_one:
- data[column_one] = data[column_one].map(transformation_one).fillna(np.nan) # 将8转换为缺失值
- # 定义转换规则
- transformation_two = {1: 0, 2: 1, 3: 1, 4: 2, 5: 3}
- # 应用转换规则
- columns_cesd_two = ['b35_18', 'b37_18', 'b310a_18']
- for column_two in columns_cesd_two:
- data[column_two] = data[column_two].map(transformation_two).fillna(np.nan) # 将8转换为缺失值
- result['cesd'] = data['b31_18'] + data['b32_18'] + data['b33_18'] + data['b34_18'] + data['b36_18'] + data['b38_18'] + data['b39_18'] + data['b35_18'] + data['b37_18'] + data['b310a_18']
- result['cesd_d'] = result['cesd'].apply(lambda x : 0 if x >= 0 and x <= 15 else (1 if x >=16 and x <= 30 else np.nan))
-
- # 日常生活活动能力 0无残疾;1有残疾
- # ADL6个项目bathing, dressing, eating, indoor transferring, toileting, and continence, 每个项目的分值范围是0到2分, 将每个ADL项目的得分相加,得到总得分
- columns_adl_08 = ['e1', 'e2', 'e3', 'e4', 'e5', 'e6']
- trans_adl(columns_adl_08, data, result, "adl_08")
- columns_adl_11 = ['e1_11', 'e2_11', 'e3_11', 'e4_11', 'e5_11', 'e6_11']
- trans_adl(columns_adl_11, data, result, "adl_11")
- columns_adl_14 = ['e1_14', 'e2_14', 'e3_14', 'e4_14', 'e5_14', 'e6_14']
- trans_adl(columns_adl_14, data, result, "adl_14")
- columns_adl_18 = ['e1_18', 'e2_18', 'e3_18', 'e4_18', 'e5_18', 'e6_18']
- trans_adl(columns_adl_18, data, result, "adl_18")
- # 认知功能 0有认知功能障碍;1认知功能正常
- # 简易精神状态评价量表(Mini-mental State Examination, MMSE),该量表包括一般能力(12分),反应能力(3分),注意力与计算力(6分),回忆力(3分),语言理解
- # 与自我协调能力(6分)5个部分24个问题,总分30分,分数越高,表示认知功能水平越高
- columns_mmse = ["c11", "c12", "c13", "c14", "c15", "c21a", "c21b", "c21c", "c31a", "c31b", "c31c", "c31d", "c31e", "c32", "c41a", "c41b", "c41c", "c51a", "c51b", "c52", "c53a", "c53b", "c53c",
- "c11_11", "c12_11", "c13_11", "c14_11", "c15_11", "c21a_11", "c21b_11", "c21c_11", "c31a_11", "c31b_11", "c31c_11", "c31d_11", "c31e_11", "c32_11", "c41a_11", "c41b_11", "c41c_11", "c51a_11", "c51b_11", "c52_11", "c53a_11", "c53b_11", "c53c_11",
- "c11_14", "c12_14", "c13_14", "c14_14", "c15_14", "c21a_14", "c21b_14", "c21c_14", "c31a_14", "c31b_14", "c31c_14", "c31d_14", "c31e_14", "c32_14", "c41a_14", "c41b_14", "c41c_14", "c51a_14", "c51b_14", "c52_14", "c53a_14", "c53b_14", "c53c_14",
- "c11_18", "c12_18", "c13_18", "c14_18", "c15_18", "c21a_18", "c21b_18", "c21c_18", "c31a_18", "c31b_18", "c31c_18", "c31d_18", "c31e_18", "c32_18", "c41a_18", "c41b_18", "c41c_18", "c51a_18", "c51b_18", "c52_18", "c53a_18", "c53b_18", "c53c_18"]
- trans_mmse(columns_mmse, data)
- columns_mmse_c16 = ["c16", "c16_11", "c16_14", "c16_18"]
- trans_mmse_c16(columns_mmse_c16, data)
- columns_cognitive_08 = ["c11", "c12", "c13", "c14", "c15", "c16"]
- columns_reaction_08 = ["c21a", "c21b", "c21c"]
- columns_attention_08 = ["c31a", "c31b", "c31c", "c31d", "c31e", "c32"]
- columns_memory_08 = ["c41a", "c41b", "c41c"]
- columns_language_08 = ["c51a", "c51b", "c52", "c53a", "c53b", "c53c"]
- get_mmse(columns_cognitive_08,columns_reaction_08,columns_attention_08, columns_memory_08 ,columns_language_08, data, result, "08")
- columns_cognitive_11 = ["c11_11", "c12_11", "c13_11", "c14_11", "c15_11", "c16_11"]
- columns_reaction_11 = ["c21a_11", "c21b_11", "c21c_11"]
- columns_attention_11 = ["c31a_11", "c31b_11", "c31c_11", "c31d_11", "c31e_11", "c32_11"]
- columns_memory_11 = ["c41a_11", "c41b_11", "c41c_11"]
- columns_language_11 = ["c51a_11", "c51b_11", "c52_11", "c53a_11", "c53b_11", "c53c_11"]
- get_mmse(columns_cognitive_11,columns_reaction_11,columns_attention_11, columns_memory_11 ,columns_language_11, data, result, "11")
- columns_cognitive_14 = ["c11_14", "c12_14", "c13_14", "c14_14", "c15_14", "c16_14"]
- columns_reaction_14 = ["c21a_14", "c21b_14", "c21c_14"]
- columns_attention_14 = ["c31a_14", "c31b_14", "c31c_14", "c31d_14", "c31e_14", "c32_14"]
- columns_memory_14 = ["c41a_14", "c41b_14", "c41c_14"]
- columns_language_14 = ["c51a_14", "c51b_14", "c52_14", "c53a_14", "c53b_14", "c53c_14"]
- get_mmse(columns_cognitive_14,columns_reaction_14,columns_attention_14, columns_memory_14 ,columns_language_14, data, result, "14")
- columns_cognitive_18 = ["c11_18", "c12_18", "c13_18", "c14_18", "c15_18", "c16_18"]
- columns_reaction_18 = ["c21a_18", "c21b_18", "c21c_18"]
- columns_attention_18 = ["c31a_18", "c31b_18", "c31c_18", "c31d_18", "c31e_18", "c32_18"]
- columns_memory_18 = ["c41a_18", "c41b_18", "c41c_18"]
- columns_language_18 = ["c51a_18", "c51b_18", "c52_18", "c53a_18", "c53b_18", "c53c_18"]
- get_mmse(columns_cognitive_18,columns_reaction_18,columns_attention_18, columns_memory_18 ,columns_language_18, data, result, "18")
- # #血液指标
- #2008
- columns_bio = ["id"]
- biomarker = pd.read_csv("CLHLS/biomarker_dataset_CLHLS_2008.tab", sep='\t')
- biomarker = biomarker.loc[:,["id", "plt", "lymph", "hdl"]]
- for col in biomarker.columns:
- if not col == "id":
- columns_bio.append(col+"_08")
- biomarker.columns = columns_bio
- result = pd.merge(result, biomarker, on = ["id"], how="left")
- #2012
- columns_bio = ["id"]
- biomarker = pd.read_csv("CLHLS/biomarker_dataset_CLHLS_2012.tab", sep='\t')
- biomarker = biomarker.loc[:,["id", "plt", "lymph", "hdlc"]]
- for col in biomarker.columns:
- if not col == "id":
- columns_bio.append(col+"_12")
- biomarker.columns = columns_bio
- result = pd.merge(result, biomarker, on = ["id"], how="left")
- #2014
- columns_bio = ["id"]
- biomarker = pd.read_csv("CLHLS/biomarker_dataset_CLHLS_2014.tab", sep='\t')
- biomarker = biomarker.loc[:,["id", "plt", "lymph", "hdlc"]]
- for col in biomarker.columns:
- if not col == "id":
- columns_bio.append(col+"_14")
- biomarker.columns = columns_bio
- result = pd.merge(result, biomarker, on = ["id"], how="left")
- print(result.head())
- result.to_csv("CLHLS/clhls_2008_2018_result.csv", index=False)
- if __name__ == "__main__":
- # deal_1998_2018_data()
- deal_2008_2018_data()
- print(123)
|