1 mês atrás · a4e9339d5c
--- a/CHARLS_P/CHARLS_harmonized.py
+++ b/CHARLS_P/CHARLS_harmonized.py
@@ -1,6 +1,8 @@
 
				 import pandas as pd
			
 
				 import numpy as np
			
 
				 import pyreadstat
			
 
				+from datetime import date
			
 
				+from lunarcalendar import Converter, Lunar
			
 
				 
			
 
				 
			
 
				 # 定义一个函数，用于更新 harmonized 中的 mstat 列
			
@@ -43,6 +45,43 @@ def merge_data(harmonized, waves, flag="other"):
 
				         )[col_name])
			
 
				     return merged_data
			
 
				 
			
 
				+# 通过 groupby 采用少数服从多数原则填充性别
			
 
				+def fill_gender(group, col):
			
 
				+    # 计算性别众数
			
 
				+    mode_gender = group[col].mode()
			
 
				+    if not mode_gender.empty:
			
 
				+        # 用众数替换组内所有性别值
			
 
				+        group[col] = mode_gender[0]
			
 
				+    return group
			
 
				+
			
 
				+def calculate_age(row):
			
 
				+    # 检查空值
			
 
				+    if pd.isnull(row['birth_year']) or pd.isnull(row['birth_month']) or pd.isnull(row['iyear']) or pd.isnull(row['imonth']):
			
 
				+        return np.nan  # 返回 NaN 代表无法计算年龄
			
 
				+
			
 
				+    # 获取出生年月
			
 
				+    birth_year = int(row['birth_year'])
			
 
				+    birth_month = int(row['birth_month'])
			
 
				+    if birth_month == 0:
			
 
				+        birth_month = 6
			
 
				+    # 确定出生日期
			
 
				+    if row['ba003'] == 1:
			
 
				+        # 公历
			
 
				+        birth_date = date(birth_year, birth_month, 1)
			
 
				+    else:
			
 
				+        lunar = Lunar(birth_year, birth_month, 1, isleap=False)
			
 
				+        # 农历
			
 
				+        birth_date = Converter.Lunar2Solar(lunar)
			
 
				+    
			
 
				+    # 获取随访年月
			
 
				+    followup_year = int(row['iyear'])
			
 
				+    followup_month = int(row['imonth'])
			
 
				+    followup_date = date(followup_year, followup_month, 1)
			
 
				+    
			
 
				+    # 计算年龄
			
 
				+    age = followup_date.year - birth_date.year - ((followup_date.month, followup_date.day) < (birth_date.month, birth_date.day))
			
 
				+    return age    
			
 
				+
			
 
				 if __name__ == "__main__":
			
 
				     harmonized, meta = pyreadstat.read_dta("/root/r_base/CHARLS/Harmonized_CHARLS/H_CHARLS_D_Data.dta")
			
 
				     CHARLS_data = pd.read_csv("CHARLS_data_pollutants_p_n_m_nd.csv")
			
@@ -70,7 +109,7 @@ if __name__ == "__main__":
 
				 
			
 
				     #BMI
			
 
				     waves = [(2011, "r1mbmi"), (2013, "r2mbmi"), (2015, "r3mbmi")]
			
 
				-    CHARLS_data["BMI"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
			
 
				+    CHARLS_data["BMI_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
			
 
				 
			
 
				     #收缩压#舒张压
			
 
				     waves = [(2011, "r1systo"), (2013, "r2systo"), (2015, "r3systo")]
			
@@ -135,6 +174,14 @@ if __name__ == "__main__":
 
				     waves = [(2011, "r1sleeprl"), (2013, "r2sleeprl"), (2015, "r3sleeprl"), (2018, "r4sleeprl")]
			
 
				     CHARLS_data["sleep_state_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
			
 
				 
			
 
				+    # ADL
			
 
				+    waves = [(2011, "s1adlab_c"), (2013, "s2adlab_c"), (2015, "s3adlab_c"), (2018, "s4adlab_c")]
			
 
				+    CHARLS_data["ADL_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
			
 
				+
			
 
				+    #年龄
			
 
				+    waves = [(2011, "r1agey"), (2013, "r2agey"), (2015, "r3agey"), (2018, "r4agey")]
			
 
				+    CHARLS_data["age_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
			
 
				+
			
 
				     #计算认知功能得分，分成三部分：电话问卷9分，词语回忆10分、画图1分
			
 
				     waves = [(2011, "r1orient"), (2013, "r2orient"), (2015, "r3orient"), (2018, "r4orient")]
			
 
				     CHARLS_data["Date_Naming"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
			
@@ -162,7 +209,10 @@ if __name__ == "__main__":
 
				                   	"Physical_activity_m",	"Smoke_m",	'Drink_m',	'Hypertension_m',	'Disabetes_or_High_Blood_Sugar_m',
			
 
				                     'Cancer_or_Malignant_Tumor_m',	'Chronic_Lung_Diseases_m',	'Heart_Problems_m',	'Emotional_Nervous_or_Psychiatric_Problems_m',
			
 
				                 	'Stroke_m',	'Arthritis_or_Rheumatism_m',	'Dyslipidemia_m',	'Liver_Disease_m',	'Kidney_Diease_m',	'Stomach_or_Other_Digestive_Disease_m',
			
 
				-                	'Asthma_m',	'Memory_Related_Disease_m',	'Psychiatric_score_m',	'sleep_state_m', 'Cognition_score_m']
			
 
				+                	'Asthma_m',	'Memory_Related_Disease_m',	'Psychiatric_score_m',	'sleep_state_m', 'Cognition_score_m', "age_m", "ADL_m"]
			
 
				+    
			
 
				+    #先处理身高1~2单位为米，大于3为cm
			
 
				+    CHARLS_data['Height'] = CHARLS_data['Height'].apply(lambda x: x if 1 <= x <= 2 else (x / 100 if x > 3 else x))
			
 
				     
			
 
				     # 遍历 merge_list 列表
			
 
				     for col_m in merge_list:
			
@@ -170,6 +220,10 @@ if __name__ == "__main__":
 
				         if col in CHARLS_data.columns and col_m in CHARLS_data.columns:
			
 
				             CHARLS_data[col] = CHARLS_data[col_m].fillna(CHARLS_data[col])
			
 
				 
			
 
				+    # 计算BMI
			
 
				+    CHARLS_data['BMI'] = CHARLS_data['Weight'] /(CHARLS_data['Height'] * CHARLS_data['Height'])
			
 
				+    CHARLS_data['BMI'] = CHARLS_data["BMI_m"].fillna(CHARLS_data['BMI'])
			
 
				+
			
 
				     # 处理慢性病标准不一样，将2变为0
			
 
				     chronic_disease = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
			
 
				                   'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
			
@@ -183,5 +237,20 @@ if __name__ == "__main__":
 
				         if col in CHARLS_data.columns and col_m in CHARLS_data.columns:
			
 
				             CHARLS_data[col] = CHARLS_data[col_m].fillna(CHARLS_data[col])
			
 
				 
			
 
				-    CHARLS_data = CHARLS_data.drop(columns=["Date_Naming", "Immediate_Word_Recall", "Delayed_Word_Recall", "Serial_7", "Drawing_Picture"] + merge_list+ common_new_list)
			
 
				+    CHARLS_data = CHARLS_data.drop(columns=["Date_Naming", "Immediate_Word_Recall", "Delayed_Word_Recall", "Serial_7", "Drawing_Picture", "BMI_m"] + merge_list+ common_new_list)
			
 
				+    
			
 
				+    #处理性别
			
 
				+    CHARLS_data = CHARLS_data.groupby('ID').apply(lambda group: fill_gender(group, "rgender")).reset_index(drop=True)
			
 
				+
			
 
				+    #处理出生年月
			
 
				+    CHARLS_data = CHARLS_data.groupby('ID').apply(lambda group: fill_gender(group, "birth_year")).reset_index(drop=True)
			
 
				+    CHARLS_data = CHARLS_data.groupby('ID').apply(lambda group: fill_gender(group, "birth_month")).reset_index(drop=True)
			
 
				+
			
 
				+    #处理教育
			
 
				+    CHARLS_data = CHARLS_data.groupby('ID').apply(lambda group: fill_gender(group, "education")).reset_index(drop=True)
			
 
				+    
			
 
				+    #重新计算年龄
			
 
				+    CHARLS_data["age"] = CHARLS_data.apply(calculate_age, axis=1)
			
 
				+
			
 
				+    
			
 
				     CHARLS_data.to_csv("CHARLS_data_pollutants_p_n_m_nd_h.csv", index=False)
			
--- a/CHARLS_P/CHARLS_preprocess_main.py
+++ b/CHARLS_P/CHARLS_preprocess_main.py
@@ -1,10 +1,12 @@
 
				 import pandas as pd
			
 
				 import numpy as np
			
 
				 import pyreadstat
			
 
				+from datetime import date
			
 
				+from lunarcalendar import Converter, Lunar
			
 
				 
			
 
				 #统一列名
			
 
				 def change_columns(df):
			
 
				-    df.columns = ["ID",'householdID','communityID','rgender', "birth_year", "marital_status" , "education", 'province', 'city',"Height", "Weight",
			
 
				+    df.columns = ["ID",'householdID','communityID','rgender', "birth_year", "birth_month", "ba003", "iyear", "imonth", "marital_status" , "education", 'province', 'city',"Height", "Weight",
			
 
				                   "waist", "Systolic","Diastolic",
			
 
				 
			
 
				                   'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp', 
			
@@ -18,7 +20,7 @@ def change_columns(df):
 
				                   
			
 
				                   'Smoke','Drink',
			
 
				                   
			
 
				-                  "Cognition_score", "Psychiatric_score","sleep_state", "wave"
			
 
				+                  "Cognition_score", "Psychiatric_score","sleep_state", "ADL", "wave",
			
 
				                   ]
			
 
				 # 2020年把帕金森和记忆病症分开，需要和以前对齐   
			
 
				 def process_row(row):
			
@@ -54,7 +56,7 @@ if __name__ == "__main__":
 
				     health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_status_and_functioning.dta")
			
 
				     health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_care_and_insurance.dta")
			
 
				     exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
			
 
				-
			
 
				+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/weight.dta")
			
 
				     #性别#年龄#居住地#婚姻状况
			
 
				     # 1 married or partnered
			
 
				     # 0 other marital status (separated, divorced, unmarried, or widowed)
			
@@ -65,8 +67,11 @@ if __name__ == "__main__":
 
				     # 1 high school
			
 
				     # 2 college or above
			
 
				     demo["education"] = demo["bd001"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
			
 
				+    
			
 
				+    #获取随访时间
			
 
				+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
			
 
				 
			
 
				-    data_2011 = demo[['ID','householdID', 'communityID','rgender','ba002_1','marital_status', 'education']]
			
 
				+    data_2011 = demo[['ID','householdID', 'communityID','rgender','ba002_1', 'ba002_2','ba003',"iyear", "imonth" ,'marital_status', 'education']]
			
 
				 
			
 
				     #居住地
			
 
				     data_2011 = pd.merge(data_2011, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
			
@@ -205,6 +210,16 @@ if __name__ == "__main__":
 
				     # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
			
 
				     data_2011["sleep_state"] = health_status['dc015']
			
 
				 
			
 
				+    #ADL
			
 
				+    health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    data_2011["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
			
 
				+                        health_status["db014_score"] + health_status["db015_score"]
			
 
				+
			
 
				     data_2011["wave"] = year
			
 
				     change_columns(data_2011)
			
 
				     # 2011年的ID和其他年份有一点区别，倒数第三位加0
			
@@ -218,6 +233,7 @@ if __name__ == "__main__":
 
				     health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
			
 
				     health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
			
 
				     exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
			
 
				+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Weights.dta")
			
 
				 
			
 
				     #性别#年龄#婚姻状况
			
 
				     # 1 married or partnered
			
@@ -252,21 +268,27 @@ if __name__ == "__main__":
 
				 
			
 
				     # 纠正2011年统计错误的出生年
			
 
				     demo["birth_year"] = demo.apply(lambda x : x["ba002_1"] if not pd.isna(x["ba002_1"]) else np.nan, axis=1)
			
 
				-    birth_year_2013 = demo[['ID',"birth_year"]]
			
 
				+    demo["birth_month"] = demo.apply(lambda x : x["ba002_2"] if not pd.isna(x["ba002_2"]) else np.nan, axis=1)
			
 
				+    birth_year_2013 = demo[['ID',"birth_year", "birth_month"]]
			
 
				     # 按 'ID' 列合并两个表
			
 
				     data_2011 = pd.merge(data_2011, birth_year_2013, on='ID', how='left', suffixes=("_2011","_2013"))
			
 
				     # 使用 fillna() 来更新字段
			
 
				     data_2011['birth_year'] = data_2011['birth_year_2013'].fillna(data_2011['birth_year_2011'])
			
 
				+    data_2011['birth_month'] = data_2011['birth_month_2013'].fillna(data_2011['birth_month_2011'])
			
 
				     # 删除多余的列
			
 
				-    data_2011 = data_2011.drop(columns=['birth_year_2013', 'birth_year_2011'])
			
 
				+    data_2011 = data_2011.drop(columns=['birth_year_2013', 'birth_year_2011', 'birth_month_2013', 'birth_month_2011'])
			
 
				     #合并2011年的出生年
			
 
				-    birth_year_2011 = data_2011[['ID',"birth_year"]]
			
 
				+    birth_year_2011 = data_2011[['ID',"birth_year", "birth_month"]]
			
 
				     # 按 'ID' 列合并两个表
			
 
				     demo = pd.merge(demo, birth_year_2011, on='ID', how='left', suffixes=("_2013","_2011"))
			
 
				     # 使用 fillna() 来更新字段
			
 
				     demo['birth_year'] = demo['birth_year_2013'].fillna(demo['birth_year_2011'])
			
 
				+    demo['birth_month'] = demo['birth_month_2013'].fillna(demo['birth_month_2011'])
			
 
				+
			
 
				+    #获取随访时间
			
 
				+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
			
 
				 
			
 
				-    data_2013 = demo[['ID','householdID', 'communityID','ba000_w2_3','birth_year','marital_status', "education"]]
			
 
				+    data_2013 = demo[['ID','householdID', 'communityID','ba000_w2_3','birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', "education"]]
			
 
				 
			
 
				     #居住地
			
 
				     data_2013 = pd.merge(data_2013, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
			
@@ -425,6 +447,16 @@ if __name__ == "__main__":
 
				     # (3)Occasionally or a moderate amount of the time (3-4 days)　有时或者说有一半的时间（3-4天） 
			
 
				     # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
			
 
				     data_2013["sleep_state"] = health_status['dc015']
			
 
				+
			
 
				+    #ADL
			
 
				+    health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    data_2013["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
			
 
				+                        health_status["db014_score"] + health_status["db015_score"]
			
 
				     
			
 
				     data_2013["wave"] = year
			
 
				     change_columns(data_2013)
			
@@ -438,6 +470,7 @@ if __name__ == "__main__":
 
				     biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
			
 
				     health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
			
 
				     health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
			
 
				+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
			
 
				 
			
 
				     #性别#年龄#婚姻状况
			
 
				     # 1 married or partnered
			
@@ -460,8 +493,12 @@ if __name__ == "__main__":
 
				 
			
 
				     # 2015年的出生年
			
 
				     demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba002"]==1 else x["ba002_1"] if x["ba002"]==2 else np.nan, axis=1)
			
 
				+    demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba002"]==1 else x["ba002_2"] if x["ba002"]==2 else np.nan, axis=1)
			
 
				+    
			
 
				+    #获取随访时间
			
 
				+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
			
 
				 
			
 
				-    data_2015 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'birth_year', 'marital_status', 'education']]
			
 
				+    data_2015 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'birth_year','birth_month','ba003',"iyear",  "imonth", 'marital_status', 'education']]
			
 
				 
			
 
				     #居住地
			
 
				     data_2015 = pd.merge(data_2015, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
			
@@ -620,6 +657,16 @@ if __name__ == "__main__":
 
				     # (3)Occasionally or a moderate amount of the time (3-4 days)　有时或者说有一半的时间（3-4天） 
			
 
				     # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
			
 
				     data_2015["sleep_state"] = health_status['dc015']
			
 
				+
			
 
				+    #ADL
			
 
				+    health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    data_2015["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
			
 
				+                        health_status["db014_score"] + health_status["db015_score"]
			
 
				     
			
 
				     data_2015["wave"] = year
			
 
				     change_columns(data_2015)
			
@@ -632,6 +679,7 @@ if __name__ == "__main__":
 
				     health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
			
 
				     health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
			
 
				     cognition, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Cognition.dta")
			
 
				+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
			
 
				 
			
 
				     #性别#年龄#婚姻状况
			
 
				     # 1 married or partnered
			
@@ -648,8 +696,12 @@ if __name__ == "__main__":
 
				 
			
 
				     # 出生年
			
 
				     demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba005_w4"]==1 else x["ba002_1"] if x["ba005_w4"]==2 else np.nan, axis=1)
			
 
				+    demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba005_w4"]==1 else x["ba002_2"] if x["ba005_w4"]==2 else np.nan, axis=1)
			
 
				 
			
 
				-    data_2018 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year', 'marital_status', 'education']]
			
 
				+    #获取随访时间
			
 
				+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
			
 
				+
			
 
				+    data_2018 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear",  "imonth", 'marital_status', 'education']]
			
 
				 
			
 
				     #居住地
			
 
				     data_2018 = pd.merge(data_2018, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
			
@@ -794,6 +846,16 @@ if __name__ == "__main__":
 
				     # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
			
 
				     data_2018["sleep_state"] = cognition['dc015'].apply(lambda x : np.nan if x > 4 else x) 
			
 
				     
			
 
				+    #ADL
			
 
				+    health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    data_2018["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
			
 
				+                        health_status["db014_score"] + health_status["db015_score"]
			
 
				+    
			
 
				     data_2018["wave"] = year
			
 
				     change_columns(data_2018)
			
 
				     data_2018 = pd.concat([data_2015, data_2018], axis=0)
			
@@ -803,6 +865,7 @@ if __name__ == "__main__":
 
				     demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
			
 
				     psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
			
 
				     health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
			
 
				+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
			
 
				 
			
 
				     #性别#年龄#婚姻状况
			
 
				     # 1 married or partnered
			
@@ -824,14 +887,21 @@ if __name__ == "__main__":
 
				 
			
 
				     # 出生年
			
 
				     demo["birth_year"] = demo.apply(lambda x : x["ba003_1"] if pd.isna(x["ba003_1"]) else np.nan, axis=1)
			
 
				+    demo["birth_month"] = demo.apply(lambda x : x["ba003_2"] if pd.isna(x["ba003_2"]) else np.nan, axis=1)
			
 
				     #合并2018年的出生年
			
 
				-    birth_year_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"birth_year"]]
			
 
				+    birth_year_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"birth_year", "birth_month"]]
			
 
				     # 按 'ID' 列合并两个表
			
 
				     demo = pd.merge(demo, birth_year_2018, on='ID', how='left', suffixes=("_2020","_2018"))
			
 
				     # 使用 fillna() 来更新字段
			
 
				     demo['birth_year'] = demo['birth_year_2020'].fillna(demo['birth_year_2018'])
			
 
				+    demo['birth_month'] = demo['birth_month_2020'].fillna(demo['birth_month_2018'])
			
 
				+
			
 
				+    #获取随访时间
			
 
				+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
			
 
				 
			
 
				-    data_2020 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year', 'marital_status', 'education']]
			
 
				+    demo["ba003"] = 1
			
 
				+
			
 
				+    data_2020 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear",  "imonth", 'marital_status', 'education']]
			
 
				     #居住地
			
 
				     data_2020 = pd.merge(data_2020, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
			
 
				 
			
@@ -976,6 +1046,16 @@ if __name__ == "__main__":
 
				     # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
			
 
				     data_2020["sleep_state"] = health_status['dc022'].apply(lambda x : np.nan if x >900 else x) 
			
 
				 
			
 
				+    #ADL
			
 
				+    health_status["db010_score"] = health_status["db001"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db011_score"] = health_status["db003"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db012_score"] = health_status["db005"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db013_score"] = health_status["db007"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db014_score"] = health_status["db009"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db015_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    data_2020["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
			
 
				+                        health_status["db014_score"] + health_status["db015_score"]
			
 
				+    
			
 
				     data_2020["wave"] = year
			
 
				     change_columns(data_2020)
			
 
				     data_2020 = pd.concat([data_2018, data_2020], axis=0)
			
@@ -998,5 +1078,6 @@ if __name__ == "__main__":
 
				     data_2020['city'] = data_2020['city'].replace('巢湖市', '合肥市')
			
 
				     #襄樊市->襄阳市
			
 
				     data_2020['city'] = data_2020['city'].replace('襄樊市', '襄阳市') 
			
 
				+
			
 
				     data_2020.to_csv("/root/r_base/CHARLS/result_all_new.csv", index=False)
			
 
				     print(123)
			
--- a/paper_code/code.R
+++ b/paper_code/code.R
@@ -3,58 +3,63 @@
 
				 library(msm)
			
 
				 library(survival)
			
 
				 
			
 
				-# data <- data.frame(
			
 
				-#   ID = c(1, 1, 1, 2, 2, 2),        # 人员ID
			
 
				-#   time = c(0, 1, 2, 0, 1, 2),      # 随访时间
			
 
				-#   state = c(1, 2, 3, 1, 1, 2),     # 疾病状态
			
 
				-#   birth_year = c(1970, 1970, 1970, 1980, 1980, 1980),
			
 
				-#   gender = c(1, 1, 1, 2, 2, 2),    # 性别
			
 
				-#   education = c(3, 3, 3, 2, 2, 2)  # 教育程度
			
 
				-# )
			
 
				-# statetable.msm(state, ID, data = data)
			
 
				-
			
 
				-# qmatrix_init <- matrix(c(-0.5, 0.25, 0.25,
			
 
				-#                          0.1, -0.3, 0.2,
			
 
				-#                          0, 0, 0), 
			
 
				-#                        nrow = 3, byrow = TRUE)
			
 
				-
			
 
				-# msm_model <- msm(state ~ time, subject = ID, data = data,
			
 
				-#                  qmatrix = qmatrix_init, 
			
 
				-#                  covariates = ~ gender + education)
			
 
				-# pmatrix.msm(msm_model, t = 1)  # t = 1 代表随访之间的间隔时间
			
 
				-# summary(msm_model)
			
 
				-
			
 
				-# 创建数据框
			
 
				-data <- data.frame(
			
 
				-  ID = c(1, 1, 1, 2, 2, 2),        # 人员ID
			
 
				-  time = c(0, 1, 2, 0, 1, 2),      # 随访时间
			
 
				-  state = c(1, 2, 3, 1, 1, 2),     # 疾病状态
			
 
				-  birth_year = c(1970, 1970, 1970, 1980, 1980, 1980), # 出生年份
			
 
				-  gender = c(1, 1, 1, 2, 2, 2),    # 性别
			
 
				-  education = c(3, 3, 3, 2, 2, 2)  # 教育程度
			
 
				-)
			
 
				+# View(cav)
			
 
				 
			
 
				+data <- read.csv("paper_data.csv")
			
 
				+
			
 
				+# data$age_group <- cut(data$age,
			
 
				+#                       breaks = c(45, 55, 65, Inf),
			
 
				+#                       labels = c("45-54", "55-64",">=65"),
			
 
				+#                       right = FALSE)
			
 
				+
			
 
				+data$gender_ <- factor(data$rgender, levels = c(1, 2), labels = c("男", "女"))
			
 
				+
			
 
				+summary(data$age_group)
			
 
				+# View(data[, c("age", "age_group")])
			
 
				+View(data$age_group)
			
 
				 # 计算状态转移频数表
			
 
				 freq_table <- statetable.msm(state, ID, data = data)
			
 
				 print(freq_table)
			
 
				 
			
 
				 # 初始化转移速率矩阵
			
 
				-qmatrix_init <- matrix(c(-0.5, 0.25, 0.25,
			
 
				-                          0.1, -0.3, 0.2,
			
 
				-                          0.3, 0.2, -0.5), 
			
 
				-                        nrow = 3, byrow = TRUE)
			
 
				+qmatrix_init <- matrix(c(-0.5, 0.25, 0.15, 0.1,
			
 
				+                          0.1, -0.3, 0.1, 0.1,
			
 
				+                          0.3, 0.1, -0.5, 0.1,
			
 
				+                          0,   0,    0,   0), 
			
 
				+                        nrow = 4, byrow = TRUE)
			
 
				 
			
 
				 # 创建初始模型
			
 
				-crude_init <- crudeinits.msm(state ~ time+ gender, subject = ID, data = data, qmatrix = qmatrix_init)
			
 
				+crude_init <- crudeinits.msm(state ~ wave, subject = ID, data = data, qmatrix = qmatrix_init)
			
 
				+
			
 
				+View(crude_init)
			
 
				 
			
 
				 # 进行多状态模型分析
			
 
				-msm_model <- msm(state ~ time, subject = ID, data = data,
			
 
				+msm_model <- msm(state ~ wave, subject = ID, data = data,
			
 
				                  qmatrix = crude_init,
			
 
				-                 covariates = ~ gender+education)
			
 
				+                 covariates = ~ gender_, 
			
 
				+                 death = 4,
			
 
				+                 method = "BFGS", control = list(fnscale = 4000, maxit = 10000)
			
 
				+                 )
			
 
				 
			
 
				 # 计算状态转移概率矩阵
			
 
				-prob_matrix <- pmatrix.msm(msm_model, t = 1)  # t = 1 代表随访之间的间隔时间
			
 
				+prob_matrix <- pmatrix.msm(msm_model, t = 5)  # t = 1 代表随访之间的间隔时间
			
 
				 print(prob_matrix)
			
 
				 
			
 
				+# 输出拟合模型的速率矩阵
			
 
				+q_matrix <- qmatrix.msm(msm_model)
			
 
				+print(q_matrix)
			
 
				+
			
 
				+# 提取转移强度
			
 
				+transition_intensity <- msm_model$qmatrix
			
 
				+print(transition_intensity)
			
 
				+
			
 
				+# 计算在每个状态中的平均逗留时间
			
 
				+so_journ <- sojourn.msm(msm_model)
			
 
				+print(so_journ)
			
 
				+
			
 
				+# 计算均衡状态概率
			
 
				+
			
 
				 # 查看模型的详细结果
			
 
				 summary(msm_model)
			
 
				+
			
 
				+rm(list = ls())
			
--- a/paper_code/data_preprocess.py
+++ b/paper_code/data_preprocess.py
@@ -0,0 +1,178 @@
 
				+import pandas as pd
			
 
				+import numpy as np
			
 
				+import pyreadstat
			
 
				+from sklearn.experimental import enable_iterative_imputer
			
 
				+from sklearn.impute import IterativeImputer
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_p_n_m_nd_h.csv")
			
 
				+
			
 
				+    cavariates = ["last_year_NO2", 	"before_last_NO2", 	"last_year_O3", "before_last_O3", 
			
 
				+                        "last_year_pm1",	"before_last_pm1",	"last_year_pm2.5",	"before_last_pm2.5",	"last_year_pm10",
			
 
				+                    	'before_last_pm10',	'last_year_SO4',	'last_year_NO3',	'last_year_NH4',	'last_year_OM',	'last_year_BC',	'before_last_SO4',
			
 
				+                    	'before_last_NO3',	'before_last_NH4',	'before_last_OM',	'before_last_BC',	'last_year_nl',	'before_last_nl']
			
 
				+    #挑出需要的字段
			
 
				+    data = CHARLS_data[["ID", "rgender", "age", "marital_status", "education", "Physical_activity", "Psychiatric_score", "BMI", "ADL", "Smoke", "Drink",
			
 
				+                        'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
			
 
				+                        'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
			
 
				+                        'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', 'wave'
			
 
				+                        ]+cavariates]
			
 
				+    #处理共病状态
			
 
				+    data["state"] = data['Hypertension'] + data['Dyslipidemia']+ data['Disabetes_or_High_Blood_Sugar'] + data['Cancer_or_Malignant_Tumor'] + data['Chronic_Lung_Diseases'] + data['Liver_Disease'] \
			
 
				+        + data['Heart_Problems'] + data['Stroke'] + data['Kidney_Diease'] + data['Stomach_or_Other_Digestive_Disease'] + data['Emotional_Nervous_or_Psychiatric_Problems'] + data['Memory_Related_Disease'] \
			
 
				+        + data['Arthritis_or_Rheumatism'] + data['Asthma']
			
 
				+    data["state"] = data['state'].apply(lambda x : 1 if x == 0 else 2 if x == 1 else 3 if x >= 2 else np.nan)
			
 
				+
			
 
				+    data = data.drop(columns=['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
			
 
				+                        'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
			
 
				+                        'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma'])
			
 
				+    
			
 
				+    #增加一列死亡状态
			
 
				+    #0：未死亡
			
 
				+    #1：死亡 
			
 
				+    #读取2013年的死亡数据
			
 
				+    exit, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/Exit_Interview.dta")
			
 
				+    exit['ID'] = pd.to_numeric(exit['ID'], errors='coerce').astype('Int64')
			
 
				+    exit["exit_year"] = exit["exb001_1"]
			
 
				+    data = pd.merge(data, exit[['ID', "exit_year"]], on = "ID", how="left")
			
 
				+
			
 
				+    #读取2020年的死亡数据
			
 
				+    exit, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2020/Exit_Module.dta")
			
 
				+    exit['ID'] = pd.to_numeric(exit['ID'], errors='coerce').astype('Int64')
			
 
				+    exit["exit_year"] = exit["exb001_1"]
			
 
				+    data = pd.merge(data, exit[['ID', "exit_year"]], on = "ID", how="left")
			
 
				+    
			
 
				+    #合并两次死亡数据
			
 
				+    data["exit_year"] = data["exit_year_x"].fillna(data["exit_year_y"])
			
 
				+
			
 
				+    # 定义随访年的列表
			
 
				+    followup_years = [2011, 2013, 2015, 2018, 2020]
			
 
				+
			
 
				+    # 将 'id' 和 'wave' 转换为整型，确保排序不会出问题
			
 
				+    data['ID'] = data['ID'].astype(int)
			
 
				+    data['wave'] = data['wave'].astype(int)
			
 
				+
			
 
				+    # 找到每个死亡个体的最后一次随访数据
			
 
				+    last_followup = data.dropna(subset=['exit_year']).groupby('ID').apply(lambda x: x[x['wave'] == x['wave'].max()])
			
 
				+
			
 
				+    # 创建一个布尔掩码，用于标记需要将 'state' 修改为 4 的行
			
 
				+    mask = last_followup['wave'] > last_followup['exit_year']
			
 
				+
			
 
				+    # 将 MultiIndex 转换为单一的 ID 索引
			
 
				+    last_followup_ids = last_followup[mask].index.get_level_values('ID')
			
 
				+
			
 
				+    # 使用布尔索引直接在 data 中修改对应行的 'state'
			
 
				+    data.loc[data['ID'].isin(last_followup_ids) & (data['wave'] == data.groupby('ID')['wave'].transform('max')), 'state'] = 4
			
 
				+
			
 
				+    # 创建新的记录并为每个死亡个体设置下一次随访年
			
 
				+    new_rows = last_followup[last_followup['wave'] <= last_followup['exit_year']].copy()
			
 
				+    new_rows['wave'] = new_rows['wave'].apply(lambda x: next((year for year in followup_years if year > x), None))
			
 
				+    new_rows['state'] = 4
			
 
				+
			
 
				+    # 将新行添加到原始 DataFrame 中
			
 
				+    data = pd.concat([data, new_rows], ignore_index=True).sort_values(by=['ID', 'wave']).reset_index(drop=True)
			
 
				+
			
 
				+    #删除多余列
			
 
				+    data = data.drop(columns=["exit_year_x", "exit_year_y", "exit_year"])
			
 
				+
			
 
				+    # 统计唯一用户的个数
			
 
				+    unique_user_count = data['ID'].nunique()
			
 
				+    print(f"删除空状态前的用户数：{unique_user_count}")
			
 
				+
			
 
				+
			
 
				+
			
 
				+    # 将状态为空的数据删除
			
 
				+    # 查找所有有空值 state 的用户 ID
			
 
				+    users_with_na = data[data['state'].isna()]['ID'].unique()
			
 
				+
			
 
				+    # 删除这些用户的所有数据
			
 
				+    data = data[~data['ID'].isin(users_with_na)]
			
 
				+
			
 
				+    unique_user_count = data['ID'].nunique()
			
 
				+    print(f"删除空状态后的用户数：{unique_user_count}")
			
 
				+
			
 
				+
			
 
				+
			
 
				+    #获取参加全部批次的纵向数据
			
 
				+    # 1. 统计每个用户的批次数
			
 
				+    user_counts = data.groupby('ID')['wave'].count().reset_index()
			
 
				+    user_counts.columns = ['ID', 'wave_count']
			
 
				+
			
 
				+    # 2. 找到每个用户的最大批次的状态
			
 
				+    max_wave_state = data.loc[data.groupby('ID')['wave'].idxmax()][['ID', 'state']]
			
 
				+    max_wave_state.columns = ['ID', 'max_wave_state']
			
 
				+
			
 
				+    # 3. 将用户的批次数和最大批次状态合并回原始数据
			
 
				+    data = data.merge(user_counts, on='ID').merge(max_wave_state, on='ID')
			
 
				+
			
 
				+    # 4. 筛选满足条件的用户
			
 
				+    condition_1 = (data['wave_count'] == 5)
			
 
				+    condition_2 = (data['max_wave_state'] == 4) & (data['wave_count'] > 1)
			
 
				+    data = data[condition_1 | condition_2]
			
 
				+
			
 
				+    # 5. 清除多余的列
			
 
				+    data = data.drop(columns=['wave_count', 'max_wave_state']).reset_index(drop=True)
			
 
				+
			
 
				+    unique_user_count = data['ID'].nunique()
			
 
				+    print(f"参加全部批次的用户数：{unique_user_count}")
			
 
				+
			
 
				+    
			
 
				+    #删除45岁以下的用户ID
			
 
				+    users_with_45_age = data[data['age']<45]['ID'].unique()
			
 
				+
			
 
				+    # 删除这些用户的所有数据
			
 
				+    data = data[~data['ID'].isin(users_with_45_age)]
			
 
				+
			
 
				+    unique_user_count = data['ID'].nunique()
			
 
				+    print(f"删除45岁后的用户数：{unique_user_count}")
			
 
				+
			
 
				+
			
 
				+
			
 
				+    # 查找所有有空值age 的用户 ID
			
 
				+    users_with_na_age = data[data['age'].isna()]['ID'].unique()
			
 
				+
			
 
				+    # 删除这些用户的所有数据
			
 
				+    data = data[~data['ID'].isin(users_with_na_age)]
			
 
				+
			
 
				+    unique_user_count = data['ID'].nunique()
			
 
				+    print(f"删除空年龄后的用户数：{unique_user_count}")
			
 
				+
			
 
				+
			
 
				+
			
 
				+    # 查找所有有空值education 的用户 ID
			
 
				+    users_with_na_education = data[data['education'].isna()]['ID'].unique()
			
 
				+
			
 
				+    # 删除这些用户的所有数据
			
 
				+    data = data[~data['ID'].isin(users_with_na_education)]
			
 
				+
			
 
				+    unique_user_count = data['ID'].nunique()
			
 
				+    print(f"删除空教育后的用户数：{unique_user_count}")
			
 
				+
			
 
				+
			
 
				+
			
 
				+    #删除异常的BMI
			
 
				+    users_with_BMI_wr = data[data["BMI"]>=200]['ID'].unique()
			
 
				+    # 删除这些用户的所有数据
			
 
				+    data = data[~data['ID'].isin(users_with_BMI_wr)]
			
 
				+
			
 
				+    unique_user_count = data['ID'].nunique()
			
 
				+    print(f"删除异常BMI的用户数：{unique_user_count}")
			
 
				+
			
 
				+
			
 
				+    #多重插补
			
 
				+    # 初始化多重插补模型
			
 
				+    imputer = IterativeImputer(max_iter=10, random_state=0)
			
 
				+
			
 
				+    # 进行多重插补
			
 
				+    imputed_data = imputer.fit_transform(data)
			
 
				+
			
 
				+    # 将插补后的数据转换为 DataFrame
			
 
				+    data = pd.DataFrame(imputed_data, columns=data.columns)
			
 
				+    # 将分类变量列取整
			
 
				+    categorical_columns = ['Physical_activity', 'Psychiatric_score', 'ADL', 'Smoke', 'Drink']  # 分类变量列名
			
 
				+    data[categorical_columns] = data[categorical_columns].round().astype(int)
			
 
				+    # 修正负值，将所有小于 0 的值替换为 0 或指定的最小值
			
 
				+    data[data < 0] = 0
			
 
				+
			
 
				+    #排序将ID相同的放到一起
			
 
				+    data.to_csv("paper_data.csv", index=False)