Kaynağa Gözat

增加论文数据处理及论文代码

JazzZhao 1 ay önce
ebeveyn
işleme
a4e9339d5c

+ 72 - 3
CHARLS_P/CHARLS_harmonized.py

@@ -1,6 +1,8 @@
 import pandas as pd
 import numpy as np
 import pyreadstat
+from datetime import date
+from lunarcalendar import Converter, Lunar
 
 
 # 定义一个函数,用于更新 harmonized 中的 mstat 列
@@ -43,6 +45,43 @@ def merge_data(harmonized, waves, flag="other"):
         )[col_name])
     return merged_data
 
def fill_gender(group, col):
    """Majority-vote fill: overwrite *col* within one group by its mode.

    Intended for use with ``groupby('ID').apply`` so that values that
    should be constant per person across waves (gender, birth year/month,
    education) become consistent.  If the column has no mode (all values
    missing) the group is returned unchanged.
    """
    majority = group[col].mode()
    if len(majority) > 0:
        # Replace every value in the group with the most frequent one.
        group[col] = majority.iloc[0]
    return group
+
def calculate_age(row):
    """Age in completed years at the follow-up interview.

    Parameters
    ----------
    row : mapping / pandas Series with fields
        'birth_year', 'birth_month' (0 = month unknown),
        'ba003' (1 = solar/Gregorian birth date, 2 = lunar birth date),
        'iyear', 'imonth' (interview year and month).

    Returns
    -------
    int age, or ``np.nan`` when any required field is missing.
    """
    # Age cannot be computed when any component is missing.
    if pd.isnull(row['birth_year']) or pd.isnull(row['birth_month']) or pd.isnull(row['iyear']) or pd.isnull(row['imonth']):
        return np.nan

    birth_year = int(row['birth_year'])
    birth_month = int(row['birth_month'])
    # Month 0 means "unknown": impute mid-year.  Also guard out-of-range
    # months, which would otherwise make date()/Lunar() raise.
    if birth_month == 0 or birth_month > 12:
        birth_month = 6

    # Only an explicit ba003 == 2 marks a lunar birth date.  The previous
    # logic used the lunar branch for every value other than 1, so a
    # missing ba003 flag was wrongly lunar-converted.
    if row['ba003'] == 2:
        lunar = Lunar(birth_year, birth_month, 1, isleap=False)
        # Convert the lunar date to its solar (Gregorian) equivalent.
        birth_date = Converter.Lunar2Solar(lunar)
    else:
        # Solar / Gregorian (also the default when ba003 is missing).
        birth_date = date(birth_year, birth_month, 1)

    # Interview date; day fixed to the 1st, matching the birth date.
    followup_date = date(int(row['iyear']), int(row['imonth']), 1)

    # Subtract one year when the birthday has not yet occurred.
    age = followup_date.year - birth_date.year - (
        (followup_date.month, followup_date.day) < (birth_date.month, birth_date.day))
    return age
+
 if __name__ == "__main__":
     harmonized, meta = pyreadstat.read_dta("/root/r_base/CHARLS/Harmonized_CHARLS/H_CHARLS_D_Data.dta")
     CHARLS_data = pd.read_csv("CHARLS_data_pollutants_p_n_m_nd.csv")
@@ -70,7 +109,7 @@ if __name__ == "__main__":
 
     #BMI
     waves = [(2011, "r1mbmi"), (2013, "r2mbmi"), (2015, "r3mbmi")]
-    CHARLS_data["BMI"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+    CHARLS_data["BMI_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
 
     #收缩压#舒张压
     waves = [(2011, "r1systo"), (2013, "r2systo"), (2015, "r3systo")]
@@ -135,6 +174,14 @@ if __name__ == "__main__":
     waves = [(2011, "r1sleeprl"), (2013, "r2sleeprl"), (2015, "r3sleeprl"), (2018, "r4sleeprl")]
     CHARLS_data["sleep_state_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
 
+    # ADL
+    waves = [(2011, "s1adlab_c"), (2013, "s2adlab_c"), (2015, "s3adlab_c"), (2018, "s4adlab_c")]
+    CHARLS_data["ADL_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+
+    #年龄
+    waves = [(2011, "r1agey"), (2013, "r2agey"), (2015, "r3agey"), (2018, "r4agey")]
+    CHARLS_data["age_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+
     #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分
     waves = [(2011, "r1orient"), (2013, "r2orient"), (2015, "r3orient"), (2018, "r4orient")]
     CHARLS_data["Date_Naming"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
@@ -162,7 +209,10 @@ if __name__ == "__main__":
                   	"Physical_activity_m",	"Smoke_m",	'Drink_m',	'Hypertension_m',	'Disabetes_or_High_Blood_Sugar_m',
                     'Cancer_or_Malignant_Tumor_m',	'Chronic_Lung_Diseases_m',	'Heart_Problems_m',	'Emotional_Nervous_or_Psychiatric_Problems_m',
                 	'Stroke_m',	'Arthritis_or_Rheumatism_m',	'Dyslipidemia_m',	'Liver_Disease_m',	'Kidney_Diease_m',	'Stomach_or_Other_Digestive_Disease_m',
-                	'Asthma_m',	'Memory_Related_Disease_m',	'Psychiatric_score_m',	'sleep_state_m', 'Cognition_score_m']
+                	'Asthma_m',	'Memory_Related_Disease_m',	'Psychiatric_score_m',	'sleep_state_m', 'Cognition_score_m', "age_m", "ADL_m"]
+    
+    #先处理身高1~2单位为米,大于3为cm
+    CHARLS_data['Height'] = CHARLS_data['Height'].apply(lambda x: x if 1 <= x <= 2 else (x / 100 if x > 3 else x))
     
     # 遍历 merge_list 列表
     for col_m in merge_list:
@@ -170,6 +220,10 @@ if __name__ == "__main__":
         if col in CHARLS_data.columns and col_m in CHARLS_data.columns:
             CHARLS_data[col] = CHARLS_data[col_m].fillna(CHARLS_data[col])
 
+    # 计算BMI
+    CHARLS_data['BMI'] = CHARLS_data['Weight'] /(CHARLS_data['Height'] * CHARLS_data['Height'])
+    CHARLS_data['BMI'] = CHARLS_data["BMI_m"].fillna(CHARLS_data['BMI'])
+
     # 处理慢性病标准不一样,将2变为0
     chronic_disease = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
                   'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
@@ -183,5 +237,20 @@ if __name__ == "__main__":
         if col in CHARLS_data.columns and col_m in CHARLS_data.columns:
             CHARLS_data[col] = CHARLS_data[col_m].fillna(CHARLS_data[col])
 
-    CHARLS_data = CHARLS_data.drop(columns=["Date_Naming", "Immediate_Word_Recall", "Delayed_Word_Recall", "Serial_7", "Drawing_Picture"] + merge_list+ common_new_list)
+    CHARLS_data = CHARLS_data.drop(columns=["Date_Naming", "Immediate_Word_Recall", "Delayed_Word_Recall", "Serial_7", "Drawing_Picture", "BMI_m"] + merge_list+ common_new_list)
+    
+    #处理性别
+    CHARLS_data = CHARLS_data.groupby('ID').apply(lambda group: fill_gender(group, "rgender")).reset_index(drop=True)
+
+    #处理出生年月
+    CHARLS_data = CHARLS_data.groupby('ID').apply(lambda group: fill_gender(group, "birth_year")).reset_index(drop=True)
+    CHARLS_data = CHARLS_data.groupby('ID').apply(lambda group: fill_gender(group, "birth_month")).reset_index(drop=True)
+
+    #处理教育
+    CHARLS_data = CHARLS_data.groupby('ID').apply(lambda group: fill_gender(group, "education")).reset_index(drop=True)
+    
+    #重新计算年龄
+    CHARLS_data["age"] = CHARLS_data.apply(calculate_age, axis=1)
+
+    
     CHARLS_data.to_csv("CHARLS_data_pollutants_p_n_m_nd_h.csv", index=False)

+ 93 - 12
CHARLS_P/CHARLS_preprocess_main.py

@@ -1,10 +1,12 @@
 import pandas as pd
 import numpy as np
 import pyreadstat
+from datetime import date
+from lunarcalendar import Converter, Lunar
 
 #统一列名
 def change_columns(df):
-    df.columns = ["ID",'householdID','communityID','rgender', "birth_year", "marital_status" , "education", 'province', 'city',"Height", "Weight",
+    df.columns = ["ID",'householdID','communityID','rgender', "birth_year", "birth_month", "ba003", "iyear", "imonth", "marital_status" , "education", 'province', 'city',"Height", "Weight",
                   "waist", "Systolic","Diastolic",
 
                   'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp', 
@@ -18,7 +20,7 @@ def change_columns(df):
                   
                   'Smoke','Drink',
                   
-                  "Cognition_score", "Psychiatric_score","sleep_state", "wave"
+                  "Cognition_score", "Psychiatric_score","sleep_state", "ADL", "wave",
                   ]
 # 2020年把帕金森和记忆病症分开,需要和以前对齐   
 def process_row(row):
@@ -54,7 +56,7 @@ if __name__ == "__main__":
     health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_status_and_functioning.dta")
     health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_care_and_insurance.dta")
     exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
-
+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/weight.dta")
     #性别#年龄#居住地#婚姻状况
     # 1 married or partnered
     # 0 other marital status (separated, divorced, unmarried, or widowed)
@@ -65,8 +67,11 @@ if __name__ == "__main__":
     # 1 high school
     # 2 college or above
     demo["education"] = demo["bd001"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
+    
+    #获取随访时间
+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
 
-    data_2011 = demo[['ID','householdID', 'communityID','rgender','ba002_1','marital_status', 'education']]
+    data_2011 = demo[['ID','householdID', 'communityID','rgender','ba002_1', 'ba002_2','ba003',"iyear", "imonth" ,'marital_status', 'education']]
 
     #居住地
     data_2011 = pd.merge(data_2011, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
@@ -205,6 +210,16 @@ if __name__ == "__main__":
     # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天) 
     data_2011["sleep_state"] = health_status['dc015']
 
+    #ADL
+    health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    data_2011["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
+                        health_status["db014_score"] + health_status["db015_score"]
+
     data_2011["wave"] = year
     change_columns(data_2011)
     # 2011年的ID和其他年份有一点区别,倒数第三位加0
@@ -218,6 +233,7 @@ if __name__ == "__main__":
     health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
     health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
     exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Weights.dta")
 
     #性别#年龄#婚姻状况
     # 1 married or partnered
@@ -252,21 +268,27 @@ if __name__ == "__main__":
 
     # 纠正2011年统计错误的出生年
     demo["birth_year"] = demo.apply(lambda x : x["ba002_1"] if not pd.isna(x["ba002_1"]) else np.nan, axis=1)
-    birth_year_2013 = demo[['ID',"birth_year"]]
+    demo["birth_month"] = demo.apply(lambda x : x["ba002_2"] if not pd.isna(x["ba002_2"]) else np.nan, axis=1)
+    birth_year_2013 = demo[['ID',"birth_year", "birth_month"]]
     # 按 'ID' 列合并两个表
     data_2011 = pd.merge(data_2011, birth_year_2013, on='ID', how='left', suffixes=("_2011","_2013"))
     # 使用 fillna() 来更新字段
     data_2011['birth_year'] = data_2011['birth_year_2013'].fillna(data_2011['birth_year_2011'])
+    data_2011['birth_month'] = data_2011['birth_month_2013'].fillna(data_2011['birth_month_2011'])
     # 删除多余的列
-    data_2011 = data_2011.drop(columns=['birth_year_2013', 'birth_year_2011'])
+    data_2011 = data_2011.drop(columns=['birth_year_2013', 'birth_year_2011', 'birth_month_2013', 'birth_month_2011'])
     #合并2011年的出生年
-    birth_year_2011 = data_2011[['ID',"birth_year"]]
+    birth_year_2011 = data_2011[['ID',"birth_year", "birth_month"]]
     # 按 'ID' 列合并两个表
     demo = pd.merge(demo, birth_year_2011, on='ID', how='left', suffixes=("_2013","_2011"))
     # 使用 fillna() 来更新字段
     demo['birth_year'] = demo['birth_year_2013'].fillna(demo['birth_year_2011'])
+    demo['birth_month'] = demo['birth_month_2013'].fillna(demo['birth_month_2011'])
+
+    #获取随访时间
+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
 
-    data_2013 = demo[['ID','householdID', 'communityID','ba000_w2_3','birth_year','marital_status', "education"]]
+    data_2013 = demo[['ID','householdID', 'communityID','ba000_w2_3','birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', "education"]]
 
     #居住地
     data_2013 = pd.merge(data_2013, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
@@ -425,6 +447,16 @@ if __name__ == "__main__":
     # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天) 
     # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天) 
     data_2013["sleep_state"] = health_status['dc015']
+
+    #ADL
+    health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    data_2013["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
+                        health_status["db014_score"] + health_status["db015_score"]
     
     data_2013["wave"] = year
     change_columns(data_2013)
@@ -438,6 +470,7 @@ if __name__ == "__main__":
     biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
     health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
     health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
 
     #性别#年龄#婚姻状况
     # 1 married or partnered
@@ -460,8 +493,12 @@ if __name__ == "__main__":
 
     # 2015年的出生年
     demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba002"]==1 else x["ba002_1"] if x["ba002"]==2 else np.nan, axis=1)
+    demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba002"]==1 else x["ba002_2"] if x["ba002"]==2 else np.nan, axis=1)
+    
+    #获取随访时间
+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
 
-    data_2015 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'birth_year', 'marital_status', 'education']]
+    data_2015 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'birth_year','birth_month','ba003',"iyear",  "imonth", 'marital_status', 'education']]
 
     #居住地
     data_2015 = pd.merge(data_2015, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
@@ -620,6 +657,16 @@ if __name__ == "__main__":
     # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天) 
     # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天) 
     data_2015["sleep_state"] = health_status['dc015']
+
+    #ADL
+    health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    data_2015["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
+                        health_status["db014_score"] + health_status["db015_score"]
     
     data_2015["wave"] = year
     change_columns(data_2015)
@@ -632,6 +679,7 @@ if __name__ == "__main__":
     health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
     health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
     cognition, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Cognition.dta")
+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
 
     #性别#年龄#婚姻状况
     # 1 married or partnered
@@ -648,8 +696,12 @@ if __name__ == "__main__":
 
     # 出生年
     demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba005_w4"]==1 else x["ba002_1"] if x["ba005_w4"]==2 else np.nan, axis=1)
+    demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba005_w4"]==1 else x["ba002_2"] if x["ba005_w4"]==2 else np.nan, axis=1)
 
-    data_2018 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year', 'marital_status', 'education']]
+    #获取随访时间
+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
+
+    data_2018 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear",  "imonth", 'marital_status', 'education']]
 
     #居住地
     data_2018 = pd.merge(data_2018, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
@@ -794,6 +846,16 @@ if __name__ == "__main__":
     # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天) 
     data_2018["sleep_state"] = cognition['dc015'].apply(lambda x : np.nan if x > 4 else x) 
     
+    #ADL
+    health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    data_2018["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
+                        health_status["db014_score"] + health_status["db015_score"]
+    
     data_2018["wave"] = year
     change_columns(data_2018)
     data_2018 = pd.concat([data_2015, data_2018], axis=0)
@@ -803,6 +865,7 @@ if __name__ == "__main__":
     demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
     psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
     health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
 
     #性别#年龄#婚姻状况
     # 1 married or partnered
@@ -824,14 +887,21 @@ if __name__ == "__main__":
 
     # 出生年
     demo["birth_year"] = demo.apply(lambda x : x["ba003_1"] if pd.isna(x["ba003_1"]) else np.nan, axis=1)
+    demo["birth_month"] = demo.apply(lambda x : x["ba003_2"] if pd.isna(x["ba003_2"]) else np.nan, axis=1)
     #合并2018年的出生年
-    birth_year_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"birth_year"]]
+    birth_year_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"birth_year", "birth_month"]]
     # 按 'ID' 列合并两个表
     demo = pd.merge(demo, birth_year_2018, on='ID', how='left', suffixes=("_2020","_2018"))
     # 使用 fillna() 来更新字段
     demo['birth_year'] = demo['birth_year_2020'].fillna(demo['birth_year_2018'])
+    demo['birth_month'] = demo['birth_month_2020'].fillna(demo['birth_month_2018'])
+
+    #获取随访时间
+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
 
-    data_2020 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year', 'marital_status', 'education']]
+    demo["ba003"] = 1
+
+    data_2020 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear",  "imonth", 'marital_status', 'education']]
     #居住地
     data_2020 = pd.merge(data_2020, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
 
@@ -976,6 +1046,16 @@ if __name__ == "__main__":
     # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天) 
     data_2020["sleep_state"] = health_status['dc022'].apply(lambda x : np.nan if x >900 else x) 
 
+    #ADL
+    health_status["db010_score"] = health_status["db001"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db011_score"] = health_status["db003"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db012_score"] = health_status["db005"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db013_score"] = health_status["db007"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db014_score"] = health_status["db009"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    health_status["db015_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
+    data_2020["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
+                        health_status["db014_score"] + health_status["db015_score"]
+    
     data_2020["wave"] = year
     change_columns(data_2020)
     data_2020 = pd.concat([data_2018, data_2020], axis=0)
@@ -998,5 +1078,6 @@ if __name__ == "__main__":
     data_2020['city'] = data_2020['city'].replace('巢湖市', '合肥市')
     #襄樊市->襄阳市
     data_2020['city'] = data_2020['city'].replace('襄樊市', '襄阳市') 
+
     data_2020.to_csv("/root/r_base/CHARLS/result_all_new.csv", index=False)
     print(123)

+ 43 - 38
paper_code/code.R

@@ -3,58 +3,63 @@
 library(msm)
 library(survival)
 
-# data <- data.frame(
-#   ID = c(1, 1, 1, 2, 2, 2),        # 人员ID
-#   time = c(0, 1, 2, 0, 1, 2),      # 随访时间
-#   state = c(1, 2, 3, 1, 1, 2),     # 疾病状态
-#   birth_year = c(1970, 1970, 1970, 1980, 1980, 1980),
-#   gender = c(1, 1, 1, 2, 2, 2),    # 性别
-#   education = c(3, 3, 3, 2, 2, 2)  # 教育程度
-# )
-# statetable.msm(state, ID, data = data)
-
-# qmatrix_init <- matrix(c(-0.5, 0.25, 0.25,
-#                          0.1, -0.3, 0.2,
-#                          0, 0, 0), 
-#                        nrow = 3, byrow = TRUE)
-
-# msm_model <- msm(state ~ time, subject = ID, data = data,
-#                  qmatrix = qmatrix_init, 
-#                  covariates = ~ gender + education)
-# pmatrix.msm(msm_model, t = 1)  # t = 1 代表随访之间的间隔时间
-# summary(msm_model)
-
-# 创建数据框
-data <- data.frame(
-  ID = c(1, 1, 1, 2, 2, 2),        # 人员ID
-  time = c(0, 1, 2, 0, 1, 2),      # 随访时间
-  state = c(1, 2, 3, 1, 1, 2),     # 疾病状态
-  birth_year = c(1970, 1970, 1970, 1980, 1980, 1980), # 出生年份
-  gender = c(1, 1, 1, 2, 2, 2),    # 性别
-  education = c(3, 3, 3, 2, 2, 2)  # 教育程度
-)
+# View(cav)
 
+data <- read.csv("paper_data.csv")
+
+# data$age_group <- cut(data$age,
+#                       breaks = c(45, 55, 65, Inf),
+#                       labels = c("45-54", "55-64",">=65"),
+#                       right = FALSE)
+
+data$gender_ <- factor(data$rgender, levels = c(1, 2), labels = c("男", "女"))
+
+summary(data$age_group)
+# View(data[, c("age", "age_group")])
+View(data$age_group)
 # 计算状态转移频数表
 freq_table <- statetable.msm(state, ID, data = data)
 print(freq_table)
 
 # 初始化转移速率矩阵
-qmatrix_init <- matrix(c(-0.5, 0.25, 0.25,
-                          0.1, -0.3, 0.2,
-                          0.3, 0.2, -0.5), 
-                        nrow = 3, byrow = TRUE)
+qmatrix_init <- matrix(c(-0.5, 0.25, 0.15, 0.1,
+                          0.1, -0.3, 0.1, 0.1,
+                          0.3, 0.1, -0.5, 0.1,
+                          0,   0,    0,   0), 
+                        nrow = 4, byrow = TRUE)
 
 # 创建初始模型
-crude_init <- crudeinits.msm(state ~ time+ gender, subject = ID, data = data, qmatrix = qmatrix_init)
+crude_init <- crudeinits.msm(state ~ wave, subject = ID, data = data, qmatrix = qmatrix_init)
+
+View(crude_init)
 
 # 进行多状态模型分析
-msm_model <- msm(state ~ time, subject = ID, data = data,
+msm_model <- msm(state ~ wave, subject = ID, data = data,
                  qmatrix = crude_init,
-                 covariates = ~ gender+education)
+                 covariates = ~ gender_, 
+                 death = 4,
+                 method = "BFGS", control = list(fnscale = 4000, maxit = 10000)
+                 )
 
 # 计算状态转移概率矩阵
-prob_matrix <- pmatrix.msm(msm_model, t = 1)  # t = 1 代表随访之间的间隔时间
+prob_matrix <- pmatrix.msm(msm_model, t = 5)  # t = 1 代表随访之间的间隔时间
 print(prob_matrix)
 
+# 输出拟合模型的速率矩阵
+q_matrix <- qmatrix.msm(msm_model)
+print(q_matrix)
+
+# 提取转移强度
+transition_intensity <- msm_model$qmatrix
+print(transition_intensity)
+
+# 计算在每个状态中的平均逗留时间
+so_journ <- sojourn.msm(msm_model)
+print(so_journ)
+
+# 计算均衡状态概率
+
 # 查看模型的详细结果
 summary(msm_model)
+
+rm(list = ls())

+ 178 - 0
paper_code/data_preprocess.py

@@ -0,0 +1,178 @@
+import pandas as pd
+import numpy as np
+import pyreadstat
+from sklearn.experimental import enable_iterative_imputer
+from sklearn.impute import IterativeImputer
+
+if __name__ == "__main__":
+    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_p_n_m_nd_h.csv")
+
+    # Air-pollution exposure covariates (interview year and the year before).
+    cavariates = ["last_year_NO2", 	"before_last_NO2", 	"last_year_O3", "before_last_O3", 
+                        "last_year_pm1",	"before_last_pm1",	"last_year_pm2.5",	"before_last_pm2.5",	"last_year_pm10",
+                    	'before_last_pm10',	'last_year_SO4',	'last_year_NO3',	'last_year_NH4',	'last_year_OM',	'last_year_BC',	'before_last_SO4',
+                    	'before_last_NO3',	'before_last_NH4',	'before_last_OM',	'before_last_BC',	'last_year_nl',	'before_last_nl']
+    # Keep only the fields needed for the multi-state analysis.
+    data = CHARLS_data[["ID", "rgender", "age", "marital_status", "education", "Physical_activity", "Psychiatric_score", "BMI", "ADL", "Smoke", "Drink",
+                        'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
+                        'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
+                        'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', 'wave'
+                        ]+cavariates]
+    # Multimorbidity state: count the 14 chronic conditions, then map
+    # 0 diseases -> state 1, exactly 1 -> state 2, 2 or more -> state 3.
+    data["state"] = data['Hypertension'] + data['Dyslipidemia']+ data['Disabetes_or_High_Blood_Sugar'] + data['Cancer_or_Malignant_Tumor'] + data['Chronic_Lung_Diseases'] + data['Liver_Disease'] \
+        + data['Heart_Problems'] + data['Stroke'] + data['Kidney_Diease'] + data['Stomach_or_Other_Digestive_Disease'] + data['Emotional_Nervous_or_Psychiatric_Problems'] + data['Memory_Related_Disease'] \
+        + data['Arthritis_or_Rheumatism'] + data['Asthma']
+    data["state"] = data['state'].apply(lambda x : 1 if x == 0 else 2 if x == 1 else 3 if x >= 2 else np.nan)
+
+    # The individual disease columns are no longer needed once collapsed.
+    data = data.drop(columns=['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
+                        'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
+                        'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma'])
+    
+    # Add death as a fourth state (state 4 is assigned further below).
+    # Read the 2013 exit (death) interviews.
+    # NOTE(review): the name `exit` shadows the `exit` builtin.
+    exit, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/Exit_Interview.dta")
+    exit['ID'] = pd.to_numeric(exit['ID'], errors='coerce').astype('Int64')
+    exit["exit_year"] = exit["exb001_1"]
+    data = pd.merge(data, exit[['ID', "exit_year"]], on = "ID", how="left")
+
+    # Read the 2020 exit (death) interviews.
+    exit, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2020/Exit_Module.dta")
+    exit['ID'] = pd.to_numeric(exit['ID'], errors='coerce').astype('Int64')
+    exit["exit_year"] = exit["exb001_1"]
+    data = pd.merge(data, exit[['ID', "exit_year"]], on = "ID", how="left")
+    
+    # Combine the two death records (_x/_y suffixes come from the two merges).
+    data["exit_year"] = data["exit_year_x"].fillna(data["exit_year_y"])
+
+    # The five CHARLS follow-up (interview) years.
+    followup_years = [2011, 2013, 2015, 2018, 2020]
+
+    # Cast 'ID' and 'wave' to int so sorting and grouping behave predictably.
+    data['ID'] = data['ID'].astype(int)
+    data['wave'] = data['wave'].astype(int)
+
+    # Last observed follow-up row for each individual that has a death record.
+    last_followup = data.dropna(subset=['exit_year']).groupby('ID').apply(lambda x: x[x['wave'] == x['wave'].max()])
+
+    # Boolean mask: the last interview wave falls AFTER the recorded death
+    # year, so that row itself should be re-labelled state 4 (dead).
+    mask = last_followup['wave'] > last_followup['exit_year']
+
+    # Collapse the (ID, row) MultiIndex produced by groupby-apply to IDs.
+    last_followup_ids = last_followup[mask].index.get_level_values('ID')
+
+    # Set 'state' to 4 in `data` at those individuals' latest wave.
+    data.loc[data['ID'].isin(last_followup_ids) & (data['wave'] == data.groupby('ID')['wave'].transform('max')), 'state'] = 4
+
+    # For deaths recorded at/after the last interview, append a new record
+    # at the next follow-up year carrying state 4.
+    new_rows = last_followup[last_followup['wave'] <= last_followup['exit_year']].copy()
+    new_rows['wave'] = new_rows['wave'].apply(lambda x: next((year for year in followup_years if year > x), None))
+    new_rows['state'] = 4
+
+    # Append the death rows and order each individual's records by wave.
+    data = pd.concat([data, new_rows], ignore_index=True).sort_values(by=['ID', 'wave']).reset_index(drop=True)
+
+    # Drop the helper columns.
+    data = data.drop(columns=["exit_year_x", "exit_year_y", "exit_year"])
+
+    # Count distinct individuals before filtering.
+    unique_user_count = data['ID'].nunique()
+    print(f"删除空状态前的用户数:{unique_user_count}")
+
+
+
+    # Drop every individual that has any missing 'state':
+    # first collect the IDs with a missing state...
+    users_with_na = data[data['state'].isna()]['ID'].unique()
+
+    # ...then remove all of those individuals' rows.
+    data = data[~data['ID'].isin(users_with_na)]
+
+    unique_user_count = data['ID'].nunique()
+    print(f"删除空状态后的用户数:{unique_user_count}")
+
+
+
+    # Keep the longitudinal sample.
+    # 1. Number of waves each individual appears in.
+    user_counts = data.groupby('ID')['wave'].count().reset_index()
+    user_counts.columns = ['ID', 'wave_count']
+
+    # 2. State at each individual's latest wave.
+    max_wave_state = data.loc[data.groupby('ID')['wave'].idxmax()][['ID', 'state']]
+    max_wave_state.columns = ['ID', 'max_wave_state']
+
+    # 3. Merge the wave counts and last-wave state back onto the data.
+    data = data.merge(user_counts, on='ID').merge(max_wave_state, on='ID')
+
+    # 4. Keep individuals observed in all 5 waves, or who died (state 4)
+    #    after at least one earlier observation.
+    condition_1 = (data['wave_count'] == 5)
+    condition_2 = (data['max_wave_state'] == 4) & (data['wave_count'] > 1)
+    data = data[condition_1 | condition_2]
+
+    # 5. Drop the helper columns.
+    data = data.drop(columns=['wave_count', 'max_wave_state']).reset_index(drop=True)
+
+    unique_user_count = data['ID'].nunique()
+    print(f"参加全部批次的用户数:{unique_user_count}")
+
+    
+    # Remove individuals younger than 45 at any observation.
+    users_with_45_age = data[data['age']<45]['ID'].unique()
+
+    # Remove all of their rows.
+    data = data[~data['ID'].isin(users_with_45_age)]
+
+    unique_user_count = data['ID'].nunique()
+    print(f"删除45岁后的用户数:{unique_user_count}")
+
+
+
+    # Remove individuals with any missing age.
+    users_with_na_age = data[data['age'].isna()]['ID'].unique()
+
+    data = data[~data['ID'].isin(users_with_na_age)]
+
+    unique_user_count = data['ID'].nunique()
+    print(f"删除空年龄后的用户数:{unique_user_count}")
+
+
+
+    # Remove individuals with missing education.
+    users_with_na_education = data[data['education'].isna()]['ID'].unique()
+
+    data = data[~data['ID'].isin(users_with_na_education)]
+
+    unique_user_count = data['ID'].nunique()
+    print(f"删除空教育后的用户数:{unique_user_count}")
+
+
+
+    # Remove individuals with implausible BMI (>= 200).
+    users_with_BMI_wr = data[data["BMI"]>=200]['ID'].unique()
+    data = data[~data['ID'].isin(users_with_BMI_wr)]
+
+    unique_user_count = data['ID'].nunique()
+    print(f"删除异常BMI的用户数:{unique_user_count}")
+
+
+    # Multiple imputation of the remaining missing values.
+    # Initialise the iterative (MICE-style) imputer.
+    imputer = IterativeImputer(max_iter=10, random_state=0)
+
+    # Fit and impute; this returns a plain ndarray, not a DataFrame.
+    imputed_data = imputer.fit_transform(data)
+
+    # Rebuild a DataFrame with the original column names.
+    data = pd.DataFrame(imputed_data, columns=data.columns)
+    # Round categorical variables back to integers after imputation.
+    categorical_columns = ['Physical_activity', 'Psychiatric_score', 'ADL', 'Smoke', 'Drink']  # categorical column names
+    data[categorical_columns] = data[categorical_columns].round().astype(int)
+    # Clamp negative imputed values to 0.
+    # NOTE(review): this mask touches EVERY numeric column - confirm intended.
+    data[data < 0] = 0
+
+    # Rows were already sorted by (ID, wave) above; write the final file.
+    data.to_csv("paper_data.csv", index=False)