CHARLS_preprocess_main.py 159 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328
  1. import pandas as pd
  2. import numpy as np
  3. import pyreadstat
  4. from datetime import date
  5. from lunarcalendar import Converter, Lunar
  6. #统一列名
  7. def change_columns(df):
  8. df.columns = ["ID",'householdID','communityID','rgender', "birth_year", "birth_month", "ba003", "iyear", "imonth", "marital_status" , "education", 'province', 'city',"urban_nbs","Height", "Weight",
  9. "waist", "Systolic","Diastolic", "Sit_Stand_5x_yes", "Sit_Stand_5x_no",
  10. "Sit_Stand_5x_times", "Walking_Speed_Time_1", "Walking_Speed_Time_2",
  11. 'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp',
  12. 'bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc',
  13. 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  14. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  15. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma',
  16. 'Physical_activity',
  17. 'Smoke','Drink', "Accident_Or_Injury","Fell_In_Last2Years", "Menarche_Year", "Menarche_Age", "Menopause_Year", "Menopause_Age",
  18. "Prostate_Issue_Year", "Prostate_Issue_Age", "Wear_Glasses"
  19. , "Average_Sleep_Hours", "Average_Nap_Minutes", "Vigorous_Activity_10Min", "Moderate_Effort_10Min"
  20. , "Walking_10Min", "Vigorous_Activity_Days", "Moderate_Effort_Days", "Walking_Days", "Vigorous_Activity_2Hours_PerDay"
  21. , "Moderate_Effort_2Hours_PerDay", "Walking_2Hours_PerDay", 'Vigorous_Activity_30Min_PerDay'
  22. , "Moderate_Effort_30Min_PerDay", "Walking_30Min_PerDay", "Vigorous_Activity_4Hours_PerDay"
  23. , "Moderate_Effort_4Hours_PerDay", "Walking_4Hours_PerDay", "Reason_For_Vigorous_Activity", "Reason_For_Moderate_Effort", "Reason_For_Walking"
  24. , "Interacted_With_Friends", "Played_Ma_jong", "Provided_help", "Sport", "Community_Related_Organization", "Charity_work", "Training_course", "Other", "None"
  25. , "Interacted_With_Friends_Fr", "Played_Ma_jong_Fr", "Provided_help_Fr", "Sport_Fr", "Community_Related_Organization_Fr", "Charity_work_Fr", "Training_course_Fr", "Other_Fr"
  26. , "Internet_Usage_LastMonth", "Internet_Tools_Desktop_computer","Internet_Tools_Laptop_computer", "Internet_Tools_Tablet_computer", "Internet_Tools_Cellphone", "Internet_Tools_Other"
  27. , "Internet_Purpose_Chat", "Internet_Purpose_news","Internet_Purpose_videos","Internet_Purpose_games","Internet_Purpose_Financial","Internet_Purpose_Others"
  28. , "Mobile_Payment", "Wechat_Usage", "Post_Moments", "Current_Smoking_Status"
  29. , "Smoking_Type_pipe","Smoking_Type_rolled","Smoking_Type_Filtered","Smoking_Type_Unfiltered","Smoking_Type_Cigar","Smoking_Type_Water"
  30. , "Daily_Cigarette_Count", "Drink_PastYear", "Drink_Monthly_Frequency",
  31. 'Heart_attack_2_years', "Recurrent_Stroke",
  32. "Cognition_score", "Psychiatric_score","sleep_state", "ADL",
  33. 'Gas_Connection','Heating_Facility', 'Heating_Energy', 'Cooking_Fuel', "wave",
  34. ]
  35. # 2020年把帕金森和记忆病症分开,需要和以前对齐
  36. def process_row(row):
  37. da002_12_ = row['da003_12_']
  38. da002_13_ = row['da003_13_']
  39. if da002_12_ == 1 or da002_13_ == 1:
  40. return 1
  41. elif da002_12_ == 2 and da002_13_ == 2:
  42. return 2
  43. elif (da002_12_ == 2 and pd.isna(da002_13_)) or (pd.isna(da002_12_) and da002_13_ == 2):
  44. return 2
  45. elif pd.isna(da002_12_) and pd.isna(da002_13_):
  46. return np.nan
  47. else:
  48. return np.nan # 预防万一,其余情况下设为NA
  49. def update_da051(value):
  50. if value == 1:
  51. return 3
  52. elif value == 3:
  53. return 1
  54. else:
  55. return value
  56. if __name__ == "__main__":
  57. # 2011年
  58. year = "2011"
  59. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/demographic_background.dta")
  60. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/psu.dta", encoding='gbk')
  61. biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/biomarkers.dta")
  62. blood, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Blood_20140429.dta")
  63. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_status_and_functioning.dta")
  64. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_care_and_insurance.dta")
  65. exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
  66. weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/weight.dta")
  67. houseing, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/housing_characteristics.dta")
  68. #性别#年龄#居住地#婚姻状况
  69. # 1 married or partnered
  70. # 0 other marital status (separated, divorced, unmarried, or widowed)
  71. demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be002"]==1 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
  72. #教育
  73. # 0 below high school
  74. # 1 high school
  75. # 2 college or above
  76. demo["education"] = demo["bd001"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
  77. #获取随访时间
  78. demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
  79. data_2011 = demo[['ID','householdID', 'communityID','rgender','ba002_1', 'ba002_2','ba003',"iyear", "imonth" ,'marital_status', 'education']]
  80. #居住地
  81. # 0 农村
  82. # 1 城市
  83. data_2011 = pd.merge(data_2011, psu[['communityID', 'province', 'city', 'urban_nbs']], on = "communityID", how="left")
  84. #身高#体重#收缩压#舒张压
  85. biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x)
  86. biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x)
  87. #腰围
  88. biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x)
  89. #血压测量后两次的平均
  90. biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x)
  91. biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x)
  92. biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x)
  93. biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x)
  94. biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
  95. biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
  96. #受试者可以在不用手臂支撑的情况下按其平时的节奏连续起立坐下五次吗
  97. # yes
  98. biomarkers["Sit_Stand_5x_yes"] = biomarkers["qh003"]
  99. # no
  100. biomarkers["Sit_Stand_5x_no"] = biomarkers["qh004"]
  101. # times
  102. biomarkers["Sit_Stand_5x_times"] = biomarkers["qh005"]
  103. # 步行速度时间
  104. biomarkers["Walking_Speed_Time_1"] = biomarkers["qg002"]
  105. biomarkers["Walking_Speed_Time_2"] = biomarkers["qg003"]
  106. biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002', "waist",'Systolic','Diastolic', "Sit_Stand_5x_yes", "Sit_Stand_5x_no",
  107. "Sit_Stand_5x_times", "Walking_Speed_Time_1", "Walking_Speed_Time_2"]]
  108. data_2011 = pd.merge(data_2011, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
  109. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  110. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  111. blood = blood.loc[:, blood.columns.difference(["bloodweight", "qc1_va003"])]
  112. data_2011 = pd.merge(data_2011, blood, on = ["ID"], how="left")
  113. # 慢性病:
  114. # (1) Hypertension 高血压病
  115. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  116. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  117. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  118. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  119. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  120. # (除脂肪肝、肿瘤或癌外)
  121. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  122. # (8) Stroke 中风
  123. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  124. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  125. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  126. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  127. # (13) Arthritis or rheumatism 关节炎或风湿病
  128. # (14) Asthma 哮喘
  129. # 体力活动
  130. # 2 vigorous (vigorous activity more than once a week)
  131. # 1 moderate (moderate activity more than once a week)
  132. # 0 inactive (the rest)
  133. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
  134. 1 if x["da051_2_"]==1 else
  135. 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
  136. else np.nan ,axis=1)
  137. # 抽烟
  138. # 1 抽过烟
  139. # 0 没有抽过烟
  140. health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  141. # 喝酒
  142. # 1 喝过酒
  143. # 0 没有喝过酒
  144. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
  145. 0 if x["da069"] == 1 else
  146. 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
  147. # 您是否经历过交通事故,或任何的重大意外伤害,并接受了治疗?
  148. # 1 是
  149. # 0 否
  150. health_status['Accident_Or_Injury']=health_status["da021"].apply(lambda x : 1 if x ==1 else
  151. 0 if x == 2 else np.nan)
  152. # 过去两年有没有摔倒?
  153. # 1 是
  154. # 0 否
  155. health_status['Fell_In_Last2Years']=health_status["da023"].apply(lambda x : 1 if x ==1 else
  156. 0 if x == 2 else np.nan)
  157. # 您什么时候开始来月经的?(year/age)
  158. health_status['Menarche_Year']=health_status["da026_1"]
  159. health_status['Menarche_Age']=health_status["da026_2"]
  160. # 您什么时候开始绝经的?
  161. health_status['Menopause_Year']=health_status["da028_1"]
  162. health_status['Menopause_Age']=health_status["da028_2"]
  163. # 第一次诊断出您有前列腺疾病是在什么时候?
  164. health_status['Prostate_Issue_Year']=health_status["da030_1"]
  165. health_status['Prostate_Issue_Age']=health_status["da030_2"]
  166. # 是否戴眼镜(包括矫正视力镜片)?
  167. # 1 是
  168. # 0 否
  169. # 2 失明
  170. # 3 偶尔
  171. health_status['Wear_Glasses']=health_status["da032"].apply(lambda x : 1 if x == 1 else 2 if x ==2 else 0 if x == 3 else np.nan)
  172. # 过去一个月内,您平均每天晚上真正睡着的时间大约是几小时?(可能短于您在床上躺着的时间)
  173. health_status['Average_Sleep_Hours']=health_status["da049"]
  174. # 过去一个月内,您通常午睡多长时间?分钟
  175. health_status['Average_Nap_Minutes']=health_status["da050"]
  176. # 您通常每周有没有至少持续做激烈活动十分钟?
  177. health_status['Vigorous_Activity_10Min']=health_status["da051_1_"]
  178. # 您通常每周有没有至少持续做中等强度的体力活动十分钟?
  179. health_status['Moderate_Effort_10Min']=health_status["da051_2_"]
  180. # 您通常每周有没有至少持续走路十分钟?
  181. health_status['Walking_10Min']=health_status["da051_3_"]
  182. # 您通常每周有多少天做[激烈活动]至少十分钟?
  183. health_status['Vigorous_Activity_Days']=health_status["da052_1_"]
  184. # 您通常每周有多少天做[中等强度的体力活动]至少十分钟?
  185. health_status['Moderate_Effort_Days']=health_status["da052_2_"]
  186. # 您通常每周有多少天做[走路]至少十分钟?
  187. health_status['Walking_Days']=health_status["da052_3_"]
  188. # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 2小时
  189. health_status['Vigorous_Activity_2Hours_PerDay']=health_status["da053_1_"]
  190. # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 2小时
  191. health_status['Moderate_Effort_2Hours_PerDay']=health_status["da053_2_"]
  192. # 在做[走路]的这些天里,您一天花多少时间做[走路] 2小时
  193. health_status['Walking_2Hours_PerDay']=health_status["da053_3_"]
  194. # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 30分钟
  195. health_status['Vigorous_Activity_30Min_PerDay']=health_status["da054_1_"]
  196. # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 30分钟
  197. health_status['Moderate_Effort_30Min_PerDay']=health_status["da054_2_"]
  198. # 在做[走路]的这些天里,您一天花多少时间做[走路] 30分钟
  199. health_status['Walking_30Min_PerDay']=health_status["da054_3_"]
  200. # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 4小时
  201. health_status['Vigorous_Activity_4Hours_PerDay']=health_status["da055_1_"]
  202. # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 4小时
  203. health_status['Moderate_Effort_4Hours_PerDay']=health_status["da055_2_"]
  204. # 在做[走路]的这些天里,您一天花多少时间做[走路] 4小时
  205. health_status['Walking_4Hours_PerDay']=health_status["da055_3_"]
  206. # 活动的原因
  207. # 1 工作需要
  208. # 2 娱乐
  209. # 3 体育锻炼
  210. # 4 其他
  211. health_status[["Reason_For_Vigorous_Activity"]]= np.nan
  212. health_status[["Reason_For_Moderate_Effort"]]= np.nan
  213. health_status[["Reason_For_Walking"]]= np.nan
  214. # 过去一个月是否进行了下列社交活动?
  215. # (1) 串门、跟朋友交往
  216. # (2) 打麻将、下棋、打牌、去社区活动室
  217. # (3) 无偿向与您不住在一起的亲人、朋友或者邻居提供帮助
  218. # (4) 去公园或者其他场所跳舞、健身、练气功等
  219. # (5) 参加社团组织活动
  220. # (6) 志愿者活动或者慈善活动/无偿照顾与您不住在一起的病人或残疾人
  221. # (7) 上学或者参加培训课程
  222. # (8)其他
  223. # (9) 以上均没有
  224. health_status["da056s1"] = health_status.apply(lambda x: 1 if x["da056s1"]==1 else 0, axis=1)
  225. health_status["da056s2"] = health_status.apply(lambda x: 2 if x["da056s2"]==2 else 0, axis=1)
  226. health_status["da056s3"] = health_status.apply(lambda x: 3 if x["da056s3"]==3 else 0, axis=1)
  227. health_status["da056s4"] = health_status.apply(lambda x: 4 if x["da056s4"]==4 else 0, axis=1)
  228. health_status["da056s5"] = health_status.apply(lambda x: 5 if x["da056s5"]==5 else 0, axis=1)
  229. health_status["da056s6"] = health_status.apply(lambda x: 6 if x["da056s6"]==6 or x["da056s7"]==7 else 0, axis=1)
  230. health_status["da056s7"] = health_status.apply(lambda x: 7 if x["da056s8"]==8 else 0, axis=1)
  231. health_status["da056s8"] = health_status.apply(lambda x: 8 if x["da056s9"]==9 or x["da056s10"]==10 or x["da056s11"]==11 else 0, axis=1)
  232. health_status["da056s9"] = health_status.apply(lambda x: 7 if x["da056s12"]==12 else 0, axis=1)
  233. # 过去一个月的活动频率
  234. # (1) Almost daily 差不多每天
  235. # (2) Almost every week 差不多每周
  236. # (3) Not regularly 不经常
  237. health_status["da057_6_"] = health_status.apply(lambda x: 1 if x["da057_6_"]==1 or x["da057_7_"]==1 else 2 if x["da057_6_"]==2 or x["da057_7_"]==2 else 3 if x["da057_6_"]==3 or x["da057_7_"]==3 else np.nan, axis=1)
  238. health_status["da057_7_"] = health_status["da057_8_"]
  239. health_status["da057_8_"] = health_status.apply(lambda x: 1 if x["da057_9_"]==1 or x["da057_10_"]==1 or x["da057_11_"]==1
  240. else 2 if x["da057_9_"]==2 or x["da057_10_"]==2 or x["da057_11_"]==2
  241. else 3 if x["da057_9_"]==3 or x["da057_10_"]==3 or x["da057_11_"]==3
  242. else np.nan, axis=1)
  243. # 过去一个月,您是否上网?
  244. # 1 是
  245. # 0 否
  246. health_status["Internet_Usage_LastMonth"] = health_status["da056s10"].apply(lambda x : 1 if x==10 else 0)
  247. # 使用以下哪些工具上网?
  248. health_status[["Internet_Tools_Desktop_computer"]] = np.nan
  249. health_status[["Internet_Tools_Laptop_computer"]] = np.nan
  250. health_status[["Internet_Tools_Tablet_computer"]] = np.nan
  251. health_status[["Internet_Tools_Cellphone"]] = np.nan
  252. health_status[["Internet_Tools_Other"]] = np.nan
  253. # 上网一般做什么?
  254. health_status[["Internet_Purpose_Chat"]] = np.nan
  255. health_status[["Internet_Purpose_news"]] = np.nan
  256. health_status[["Internet_Purpose_videos"]] = np.nan
  257. health_status[["Internet_Purpose_games"]] = np.nan
  258. health_status[["Internet_Purpose_Financial"]] = np.nan
  259. health_status[["Internet_Purpose_Others"]] = np.nan
  260. # 是否会用手机支付
  261. health_status[["Mobile_Payment"]] = np.nan
  262. # 是否使用微信?
  263. health_status[["Wechat_Usage"]] = np.nan
  264. # 发不发微信朋友圈?
  265. health_status[["Post_Moments"]] = np.nan
  266. # 现在还在吸烟还是戒烟了?
  267. # 1 仍然抽烟 Skip DA062 请跳过DA062
  268. # 2 戒烟
  269. health_status['Current_Smoking_Status']=health_status["da061"]
  270. # 吸烟时,一般抽什么烟?
  271. # (1) Smoking a pipe 用烟管吸烟(烟袋、旱烟)
  272. # (2) Smoking self-rolled cigarettes 自己卷烟抽
  273. # (3) Filtered cigarette带滤咀香烟
  274. # (4) Unfiltered cigarette不带滤咀香烟
  275. # (5) Cigar雪茄
  276. # (6) Water cigarettes 水烟
  277. health_status.loc[health_status['da060'] == 1, 'Smoking_Type_pipe'] = 1
  278. health_status.loc[health_status['da060'] == 2, 'Smoking_Type_rolled'] = 2
  279. health_status.loc[health_status['da060'] == 3, 'Smoking_Type_Filtered'] = 3
  280. health_status.loc[health_status['da060'] == 4, 'Smoking_Type_Unfiltered'] = 4
  281. health_status.loc[health_status['da060'] == 5, 'Smoking_Type_Cigar'] = 5
  282. health_status.loc[health_status['da060'] == 6, 'Smoking_Type_Water'] = 6
  283. # 现在/戒烟前平均一天抽多少支香烟?
  284. health_status['Daily_Cigarette_Count']=health_status["da063"]
  285. # 在过去的一年, 喝酒吗
  286. # (1) Drink more than once a month. 喝酒,每月超过一次
  287. # (2) Drink but less than once a month 喝酒,但每月少于一次
  288. # (3) None of these 什么都不喝
  289. health_status['Drink_PastYear']=health_status["da067"]
  290. # 过去一年内 平均一个月喝几次酒
  291. # (1)Once a month 每月一次
  292. # (2)2-3 times a month 每月2-3次
  293. # (3)Once a week 每周一次
  294. # (4)2-3 times a week 每周2-3次
  295. # (5)4-6 times a week 每周4-6次
  296. # (6)Once a day 每天一次
  297. # (7)Twice a day 一天两次
  298. # (8)More than twice a day 一天超过两次
  299. health_status['Drink_Monthly_Frequency']=health_status.apply(lambda x : 8 if x["da072"] ==8 or x["da074"] ==8 or x["da076"] ==8 else
  300. 7 if x["da072"] ==7 or x["da074"] ==7 or x["da076"] ==7 else
  301. 6 if x["da072"] ==6 or x["da074"] ==6 or x["da076"] ==6 else
  302. 5 if x["da072"] ==5 or x["da074"] ==5 or x["da076"] ==5 else
  303. 4 if x["da072"] ==4 or x["da074"] ==4 or x["da076"] ==4 else
  304. 3 if x["da072"] ==3 or x["da074"] ==3 or x["da076"] ==3 else
  305. 2 if x["da072"] ==2 or x["da074"] ==2 or x["da076"] ==2 else
  306. 1 if x["da072"] ==1 or x["da074"] ==1 or x["da076"] ==1 else np.nan, axis=1)
  307. health_status_select = health_status[['ID','householdID', 'communityID', 'da007_1_', 'da007_2_','da007_3_'
  308. ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  309. ,'da007_12_','da007_13_','da007_14_', "Physical_activity", "Smoke", "Drink"
  310. , "Accident_Or_Injury", "Fell_In_Last2Years", "Menarche_Year", "Menarche_Age"
  311. , "Menopause_Year", "Menopause_Age", "Prostate_Issue_Year", "Prostate_Issue_Age", "Wear_Glasses"
  312. , "Average_Sleep_Hours", "Average_Nap_Minutes", "Vigorous_Activity_10Min", "Moderate_Effort_10Min"
  313. , "Walking_10Min", "Vigorous_Activity_Days", "Moderate_Effort_Days", "Walking_Days", "Vigorous_Activity_2Hours_PerDay"
  314. , "Moderate_Effort_2Hours_PerDay", "Walking_2Hours_PerDay", 'Vigorous_Activity_30Min_PerDay'
  315. , "Moderate_Effort_30Min_PerDay", "Walking_30Min_PerDay", "Vigorous_Activity_4Hours_PerDay"
  316. , "Moderate_Effort_4Hours_PerDay", "Walking_4Hours_PerDay", "Reason_For_Vigorous_Activity", "Reason_For_Moderate_Effort", "Reason_For_Walking"
  317. , "da056s1", "da056s2", "da056s3", "da056s4", "da056s5", "da056s6", "da056s7", "da056s8", "da056s9"
  318. , "da057_1_" , "da057_2_", "da057_3_", "da057_4_", "da057_5_", "da057_6_", "da057_7_", "da057_8_"
  319. , "Internet_Usage_LastMonth", "Internet_Tools_Desktop_computer","Internet_Tools_Laptop_computer", "Internet_Tools_Tablet_computer", "Internet_Tools_Cellphone", "Internet_Tools_Other"
  320. , "Internet_Purpose_Chat", "Internet_Purpose_news","Internet_Purpose_videos","Internet_Purpose_games","Internet_Purpose_Financial","Internet_Purpose_Others"
  321. , "Mobile_Payment", "Wechat_Usage", "Post_Moments", "Current_Smoking_Status"
  322. , "Smoking_Type_pipe","Smoking_Type_rolled","Smoking_Type_Filtered","Smoking_Type_Unfiltered","Smoking_Type_Cigar","Smoking_Type_Water"
  323. , "Daily_Cigarette_Count", "Drink_PastYear", "Drink_Monthly_Frequency"]]
  324. data_2011 = pd.merge(data_2011, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  325. # 自上次访问以来的两年内,您是否发作过心脏病?
  326. # 1 是
  327. # 0 否
  328. data_2011[['Heart_attack_2_years']]=np.nan
  329. # 自上次访问以来,是否有医生诊断您中风复发?
  330. # 1 是
  331. # 0 否
  332. data_2011[['Recurrent_Stroke']]=np.nan
  333. #计算认知功能得分,分成三部分:电话问卷9分,词语回忆20分、画图1分
  334. health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  335. health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  336. health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  337. health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  338. # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  339. health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
  340. health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
  341. health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
  342. health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
  343. health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
  344. #词语记忆
  345. health_status["dc006s1_score"] = health_status["dc006s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  346. health_status["dc006s2_score"] = health_status["dc006s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  347. health_status["dc006s3_score"] = health_status["dc006s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  348. health_status["dc006s4_score"] = health_status["dc006s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  349. health_status["dc006s5_score"] = health_status["dc006s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  350. health_status["dc006s6_score"] = health_status["dc006s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  351. health_status["dc006s7_score"] = health_status["dc006s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  352. health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  353. health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  354. health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  355. # health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  356. health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  357. health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  358. health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  359. health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  360. health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  361. health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  362. health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  363. health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  364. health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  365. health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  366. # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  367. #画图
  368. health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  369. data_2011["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  370. health_status["dc001s3_score"] + health_status["dc002_score"]+ \
  371. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  372. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  373. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  374. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  375. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  376. health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  377. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  378. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  379. health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
  380. health_status["draw_score"]
  381. #心理得分
  382. health_status["dc009_score"] = health_status["dc009"]-1
  383. health_status["dc010_score"] = health_status["dc010"]-1
  384. health_status["dc011_score"] = health_status["dc011"]-1
  385. health_status["dc012_score"] = health_status["dc012"]-1
  386. health_status["dc013_score"] = 4 - health_status["dc013"]
  387. health_status["dc014_score"] = health_status["dc014"]-1
  388. health_status["dc015_score"] = health_status["dc015"]-1
  389. health_status["dc016_score"] = 4 - health_status["dc016"]
  390. health_status["dc017_score"] = health_status["dc017"]-1
  391. health_status["dc018_score"] = health_status["dc018"]-1
  392. data_2011["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  393. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  394. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  395. #睡眠状态
  396. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  397. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  398. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  399. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  400. data_2011["sleep_state"] = health_status['dc015']
  401. #ADL
  402. health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  403. health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  404. health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  405. health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  406. health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  407. health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  408. data_2011["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
  409. health_status["db014_score"] + health_status["db015_score"]
  410. # 是否有管道煤气或天然气?
  411. houseing["Gas_Connection"] = houseing["i019"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  412. # 是否带供暖设施(不包括土暖气和可制暖的空调)?
  413. houseing["Heating_Facility"] = houseing["i020"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  414. # 供暖所用的主要能源是什么?
  415. # (1)Solar 太阳能
  416. # (2)Coal 煤炭、蜂窝煤
  417. # (3)Natural gas 管道天然气或煤气
  418. # (4)Liquefied Petroleum Gas 液化石油气
  419. # (5)Electric 电
  420. # (6)Crop residue/Wood buring 秸秆、柴火
  421. # (7)Other 其他
  422. houseing["Heating_Energy"] = houseing["i021"]
  423. # 做饭用的主要燃料是什么?
  424. # (1)Coal 煤炭、蜂窝煤
  425. # (2)Natural gas 管道天然气或煤气
  426. # (3)Marsh gas 沼气
  427. # (4)Liquefied Petroleum Gas 液化石油气
  428. # (5)Electric 电
  429. # (6)crop residue/Wood burning 秸秆、柴火
  430. # (7)other 其他
  431. houseing["Cooking_Fuel"] = houseing["i022"]
  432. houseing_select = houseing[['householdID', 'communityID','Gas_Connection',
  433. 'Heating_Facility', 'Heating_Energy', 'Cooking_Fuel']]
  434. data_2011 = pd.merge(data_2011, houseing_select, on = ['householdID', 'communityID'], how="left")
  435. data_2011["wave"] = year
  436. change_columns(data_2011)
  437. # 2011年的ID和其他年份有一点区别,倒数第三位加0
  438. data_2011["ID"] = data_2011["ID"].apply(lambda x : x[:-2] + '0' + x[-2:] if len(str(x)) >= 3 else x)
  439. print("2011 complete")
  440. # 2013年
  441. year = "2013"
  442. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  443. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/PSU.dta", encoding='gbk')
  444. biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
  445. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  446. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
  447. exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
  448. weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Weights.dta")
  449. houseing, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Housing_Characteristics.dta")
  450. #性别#年龄#婚姻状况
  451. # 1 married or partnered
  452. # 0 other marital status (separated, divorced, unmarried, or widowed)
  453. demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be001"]==7 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
  454. #教育
  455. # 0 below high school
  456. # 1 high school
  457. # 2 college or above
  458. # 纠正2011年统计错误的教育
  459. demo["education_correct"] = demo.apply(lambda x : x["bd001_w2_3"] if x["bd001_w2_1"]==2 else np.nan, axis=1)
  460. demo["education_correct"] = demo["education_correct"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
  461. education_correct = demo[['ID',"education_correct"]]
  462. # 按 'ID' 列合并两个表
  463. data_2011 = pd.merge(data_2011, education_correct, on='ID', how='left')
  464. # 使用 fillna() 来更新字段
  465. data_2011['education'] = data_2011['education_correct'].fillna(data_2011['education'])
  466. # 删除多余的列
  467. data_2011 = data_2011.drop(columns=['education_correct'])
  468. #更新2013的教育
  469. demo["education"] = demo.apply(lambda x : x["bd001"] if pd.isna(x["bd001_w2_1"]) else x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) and not x["bd001_w2_4"]==12 else np.nan, axis=1)
  470. demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
  471. #合并2011年的教育
  472. eductaion_2011 = data_2011[['ID',"education"]]
  473. # 按 'ID' 列合并两个表
  474. demo = pd.merge(demo, eductaion_2011, on='ID', how='left', suffixes=("_2013","_2011"))
  475. # 使用 fillna() 来更新字段
  476. demo['education'] = demo['education_2013'].fillna(demo['education_2011'])
  477. # 纠正2011年统计错误的出生年
  478. demo["birth_year"] = demo.apply(lambda x : x["ba002_1"] if not pd.isna(x["ba002_1"]) else np.nan, axis=1)
  479. demo["birth_month"] = demo.apply(lambda x : x["ba002_2"] if not pd.isna(x["ba002_2"]) else np.nan, axis=1)
  480. birth_year_2013 = demo[['ID',"birth_year", "birth_month"]]
  481. # 按 'ID' 列合并两个表
  482. data_2011 = pd.merge(data_2011, birth_year_2013, on='ID', how='left', suffixes=("_2011","_2013"))
  483. # 使用 fillna() 来更新字段
  484. data_2011['birth_year'] = data_2011['birth_year_2013'].fillna(data_2011['birth_year_2011'])
  485. data_2011['birth_month'] = data_2011['birth_month_2013'].fillna(data_2011['birth_month_2011'])
  486. # 删除多余的列
  487. data_2011 = data_2011.drop(columns=['birth_year_2013', 'birth_year_2011', 'birth_month_2013', 'birth_month_2011'])
  488. #合并2011年的出生年
  489. birth_year_2011 = data_2011[['ID',"birth_year", "birth_month"]]
  490. # 按 'ID' 列合并两个表
  491. demo = pd.merge(demo, birth_year_2011, on='ID', how='left', suffixes=("_2013","_2011"))
  492. # 使用 fillna() 来更新字段
  493. demo['birth_year'] = demo['birth_year_2013'].fillna(demo['birth_year_2011'])
  494. demo['birth_month'] = demo['birth_month_2013'].fillna(demo['birth_month_2011'])
  495. #获取随访时间
  496. demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
  497. data_2013 = demo[['ID','householdID', 'communityID','ba000_w2_3','birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', "education"]]
  498. #居住地
  499. # 0 农村
  500. # 1 城市
  501. data_2013 = pd.merge(data_2013, psu[['communityID', 'province', 'city', 'urban_nbs']], on = "communityID", how="left")
  502. #身高#体重#收缩压#舒张压
  503. biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x)
  504. biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x)
  505. #腰围
  506. biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x)
  507. #血压测量后两次的平均
  508. biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x)
  509. biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x)
  510. biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x)
  511. biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x)
  512. biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
  513. biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
  514. #受试者可以在不用手臂支撑的情况下按其平时的节奏连续起立坐下五次吗
  515. # yes
  516. biomarkers["Sit_Stand_5x_yes"] = biomarkers["qh003"]
  517. # no
  518. biomarkers["Sit_Stand_5x_no"] = biomarkers["qh004"]
  519. # times
  520. biomarkers["Sit_Stand_5x_times"] = biomarkers["qh005"]
  521. # 步行速度时间
  522. biomarkers["Walking_Speed_Time_1"] = biomarkers["qg002"]
  523. biomarkers["Walking_Speed_Time_2"] = biomarkers["qg003"]
  524. biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002', 'waist','Systolic','Diastolic', "Sit_Stand_5x_yes", "Sit_Stand_5x_no",
  525. "Sit_Stand_5x_times", "Walking_Speed_Time_1", "Walking_Speed_Time_2"]]
  526. data_2013 = pd.merge(data_2013, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
  527. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  528. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  529. data_2013[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
  530. # 慢性病:
  531. # (1) Hypertension 高血压病
  532. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  533. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  534. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  535. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  536. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  537. # (除脂肪肝、肿瘤或癌外)
  538. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  539. # (8) Stroke 中风
  540. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  541. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  542. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  543. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  544. # (13) Arthritis or rheumatism 关节炎或风湿病
  545. # (14) Asthma 哮喘
  546. # 体力活动
  547. # 2 vigorous (vigorous activity more than once a week)
  548. # 1 moderate (moderate activity more than once a week)
  549. # 0 inactive (the rest)
  550. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
  551. 1 if x["da051_2_"]==1 else
  552. 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
  553. else np.nan ,axis=1)
  554. # 抽烟
  555. # 1 抽过烟
  556. # 0 没有抽过烟
  557. health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
  558. # 喝酒
  559. # 1 喝过酒
  560. # 0 没有喝过酒
  561. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
  562. 0 if x["da069"] == 1 else
  563. 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
  564. # 您是否经历过交通事故,或任何的重大意外伤害,并接受了治疗?
  565. # 1 是
  566. # 0 否
  567. health_status['Accident_Or_Injury']=health_status["da021"].apply(lambda x : 1 if x ==1 else
  568. 0 if x == 2 else np.nan)
  569. # 过去两年有没有摔倒?
  570. # 1 是
  571. # 0 否
  572. health_status['Fell_In_Last2Years']=health_status["da023"].apply(lambda x : 1 if x ==1 else
  573. 0 if x == 2 else np.nan)
  574. # 您什么时候开始来月经的?(year/age)
  575. health_status['Menarche_Year']=health_status["da026_1"]
  576. health_status['Menarche_Age']=health_status["da026_2"]
  577. # 您什么时候开始绝经的?
  578. health_status['Menopause_Year']=health_status["da028_1"]
  579. health_status['Menopause_Age']=health_status["da028_2"]
  580. # 第一次诊断出您有前列腺疾病是在什么时候?
  581. health_status['Prostate_Issue_Year']=health_status["da030_1"]
  582. health_status['Prostate_Issue_Age']=health_status["da030_2"]
  583. # 是否戴眼镜(包括矫正视力镜片)?
  584. # 1 是
  585. # 0 否
  586. # 2 失明
  587. # 3 偶尔
  588. health_status['Wear_Glasses']=health_status["da032"].apply(lambda x : 1 if x == 1 else 2 if x ==2 else 0 if x == 3 else 3 if x == 4 else np.nan)
  589. # 过去一个月内,您平均每天晚上真正睡着的时间大约是几小时?(可能短于您在床上躺着的时间)
  590. health_status['Average_Sleep_Hours']=health_status["da049"]
  591. # 过去一个月内,您通常午睡多长时间?分钟
  592. health_status['Average_Nap_Minutes']=health_status["da050"]
  593. # 您通常每周有没有至少持续做激烈活动十分钟?
  594. health_status['Vigorous_Activity_10Min']=health_status["da051_1_"]
  595. # 您通常每周有没有至少持续做中等强度的体力活动十分钟?
  596. health_status['Moderate_Effort_10Min']=health_status["da051_2_"]
  597. # 您通常每周有没有至少持续走路十分钟?
  598. health_status['Walking_10Min']=health_status["da051_3_"]
  599. # 您通常每周有多少天做[激烈活动]至少十分钟?
  600. health_status['Vigorous_Activity_Days']=health_status["da052_1_"]
  601. # 您通常每周有多少天做[中等强度的体力活动]至少十分钟?
  602. health_status['Moderate_Effort_Days']=health_status["da052_2_"]
  603. # 您通常每周有多少天做[走路]至少十分钟?
  604. health_status['Walking_Days']=health_status["da052_3_"]
  605. # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 2小时
  606. health_status['Vigorous_Activity_2Hours_PerDay']=health_status["da053_1_"]
  607. # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 2小时
  608. health_status['Moderate_Effort_2Hours_PerDay']=health_status["da053_2_"]
  609. # 在做[走路]的这些天里,您一天花多少时间做[走路] 2小时
  610. health_status['Walking_2Hours_PerDay']=health_status["da053_3_"]
  611. # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 30分钟
  612. health_status['Vigorous_Activity_30Min_PerDay']=health_status["da054_1_"]
  613. # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 30分钟
  614. health_status['Moderate_Effort_30Min_PerDay']=health_status["da054_2_"]
  615. # 在做[走路]的这些天里,您一天花多少时间做[走路] 30分钟
  616. health_status['Walking_30Min_PerDay']=health_status["da054_3_"]
  617. # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 4小时
  618. health_status['Vigorous_Activity_4Hours_PerDay']=health_status["da055_1_"]
  619. # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 4小时
  620. health_status['Moderate_Effort_4Hours_PerDay']=health_status["da055_2_"]
  621. # 在做[走路]的这些天里,您一天花多少时间做[走路] 4小时
  622. health_status['Walking_4Hours_PerDay']=health_status["da055_3_"]
  623. # 活动的原因
  624. # 1 工作需要
  625. # 2 娱乐
  626. # 3 体育锻炼
  627. # 4 其他
  628. health_status["Reason_For_Vigorous_Activity"]= health_status["da051_1_1_"]
  629. health_status["Reason_For_Moderate_Effort"]= health_status["da051_1_2_"]
  630. health_status["Reason_For_Walking"]= health_status["da051_1_3_"]
  631. # 过去一个月是否进行了下列社交活动?
  632. # (1) 串门、跟朋友交往
  633. # (2) 打麻将、下棋、打牌、去社区活动室
  634. # (3) 无偿向与您不住在一起的亲人、朋友或者邻居提供帮助
  635. # (4) 去公园或者其他场所跳舞、健身、练气功等
  636. # (5) 参加社团组织活动
  637. # (6) 志愿者活动或者慈善活动/无偿照顾与您不住在一起的病人或残疾人
  638. # (7) 上学或者参加培训课程
  639. # (8)其他
  640. # (9) 以上均没有
  641. health_status["da056s1"] = health_status.apply(lambda x: 1 if x["da056s1"]==1 else 0, axis=1)
  642. health_status["da056s2"] = health_status.apply(lambda x: 2 if x["da056s2"]==2 else 0, axis=1)
  643. health_status["da056s3"] = health_status.apply(lambda x: 3 if x["da056s3"]==3 else 0, axis=1)
  644. health_status["da056s4"] = health_status.apply(lambda x: 4 if x["da056s4"]==4 else 0, axis=1)
  645. health_status["da056s5"] = health_status.apply(lambda x: 5 if x["da056s5"]==5 else 0, axis=1)
  646. health_status["da056s6"] = health_status.apply(lambda x: 6 if x["da056s6"]==6 or x["da056s7"]==7 else 0, axis=1)
  647. health_status["da056s7"] = health_status.apply(lambda x: 7 if x["da056s8"]==8 else 0, axis=1)
  648. health_status["da056s8"] = health_status.apply(lambda x: 8 if x["da056s9"]==9 or x["da056s10"]==10 or x["da056s11"]==11 else 0, axis=1)
  649. health_status["da056s9"] = health_status.apply(lambda x: 7 if x["da056s12"]==12 else 0, axis=1)
  650. # 过去一个月的活动频率
  651. # (1) Almost daily 差不多每天
  652. # (2) Almost every week 差不多每周
  653. # (3) Not regularly 不经常
  654. health_status["da057_6_"] = health_status.apply(lambda x: 1 if x["da057_6_"]==1 or x["da057_7_"]==1 else 2 if x["da057_6_"]==2 or x["da057_7_"]==2 else 3 if x["da057_6_"]==3 or x["da057_7_"]==3 else np.nan, axis=1)
  655. health_status["da057_7_"] = health_status["da057_8_"]
  656. health_status["da057_8_"] = health_status.apply(lambda x: 1 if x["da057_9_"]==1 or x["da057_10_"]==1 or x["da057_11_"]==1
  657. else 2 if x["da057_9_"]==2 or x["da057_10_"]==2 or x["da057_11_"]==2
  658. else 3 if x["da057_9_"]==3 or x["da057_10_"]==3 or x["da057_11_"]==3
  659. else np.nan, axis=1)
  660. # 过去一个月,您是否上网?
  661. health_status["Internet_Usage_LastMonth"] = health_status["da056s10"].apply(lambda x : 1 if x==10 else 0)
  662. # 使用以下哪些工具上网?
  663. health_status[["Internet_Tools_Desktop_computer"]] = np.nan
  664. health_status[["Internet_Tools_Laptop_computer"]] = np.nan
  665. health_status[["Internet_Tools_Tablet_computer"]] = np.nan
  666. health_status[["Internet_Tools_Cellphone"]] = np.nan
  667. health_status[["Internet_Tools_Other"]] = np.nan
  668. # 上网一般做什么?
  669. health_status[["Internet_Purpose_Chat"]] = np.nan
  670. health_status[["Internet_Purpose_news"]] = np.nan
  671. health_status[["Internet_Purpose_videos"]] = np.nan
  672. health_status[["Internet_Purpose_games"]] = np.nan
  673. health_status[["Internet_Purpose_Financial"]] = np.nan
  674. health_status[["Internet_Purpose_Others"]] = np.nan
  675. # 是否会用手机支付
  676. health_status[["Mobile_Payment"]] = np.nan
  677. # 是否使用微信?
  678. health_status[["Wechat_Usage"]] = np.nan
  679. # 发不发微信朋友圈?
  680. health_status[["Post_Moments"]] = np.nan
  681. # 现在还在吸烟还是戒烟了?
  682. # 1 仍然抽烟 Skip DA062 请跳过DA062
  683. # 2 戒烟
  684. health_status['Current_Smoking_Status']=health_status["da061"]
  685. # 吸烟时,一般抽什么烟?
  686. # (1) Smoking a pipe 用烟管吸烟(烟袋、旱烟)
  687. # (2) Smoking self-rolled cigarettes 自己卷烟抽
  688. # (3) Filtered cigarette带滤咀香烟
  689. # (4) Unfiltered cigarette不带滤咀香烟
  690. # (5) Cigar雪茄
  691. # (6) Water cigarettes 水烟
  692. health_status[['Smoking_Type_pipe']]=np.nan
  693. health_status[['Smoking_Type_rolled']]=np.nan
  694. health_status[['Smoking_Type_Filtered']]=np.nan
  695. health_status[['Smoking_Type_Unfiltered']]=np.nan
  696. health_status[['Smoking_Type_Cigar']]=np.nan
  697. health_status[['Smoking_Type_Water']]=np.nan
  698. # 现在/戒烟前平均一天抽多少支香烟?
  699. health_status['Daily_Cigarette_Count']=health_status["da063"]
  700. # 在过去的一年, 喝酒吗
  701. # (1) Drink more than once a month. 喝酒,每月超过一次
  702. # (2) Drink but less than once a month 喝酒,但每月少于一次
  703. # (3) None of these 什么都不喝
  704. health_status['Drink_PastYear']=health_status["da067"]
  705. # 过去一年内 平均一个月喝几次酒
  706. # (1)Once a month 每月一次
  707. # (2)2-3 times a month 每月2-3次
  708. # (3)Once a week 每周一次
  709. # (4)2-3 times a week 每周2-3次
  710. # (5)4-6 times a week 每周4-6次
  711. # (6)Once a day 每天一次
  712. # (7)Twice a day 一天两次
  713. # (8)More than twice a day 一天超过两次
  714. health_status['Drink_Monthly_Frequency']=health_status.apply(lambda x : 8 if x["da072"] ==8 or x["da074"] ==8 or x["da076"] ==8 else
  715. 7 if x["da072"] ==7 or x["da074"] ==7 or x["da076"] ==7 else
  716. 6 if x["da072"] ==6 or x["da074"] ==6 or x["da076"] ==6 else
  717. 5 if x["da072"] ==5 or x["da074"] ==5 or x["da076"] ==5 else
  718. 4 if x["da072"] ==4 or x["da074"] ==4 or x["da076"] ==4 else
  719. 3 if x["da072"] ==3 or x["da074"] ==3 or x["da076"] ==3 else
  720. 2 if x["da072"] ==2 or x["da074"] ==2 or x["da076"] ==2 else
  721. 1 if x["da072"] ==1 or x["da074"] ==1 or x["da076"] ==1 else np.nan, axis=1)
  722. # 自上次访问以来的两年内,您是否发作过心脏病?
  723. # 1 是
  724. # 0 否
  725. health_status["Heart_attack_2_years"] = health_status.apply(lambda x : 1 if x["da007_w2_5"] ==1 else
  726. 0 if x["da007_w2_5"] == 2 else np.nan, axis=1)
  727. # 自上次访问以来,是否有医生诊断您中风复发?
  728. # 1 是
  729. # 0 否
  730. health_status['Recurrent_Stroke']=health_status.apply(lambda x : 1 if x["da019_w2_1"] ==1 else
  731. 0 if x["da019_w2_1"] == 2 else np.nan, axis=1)
  732. # 合并2011年的慢性病
  733. columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  734. ,'da007_12_','da007_13_','da007_14_']
  735. columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  736. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  737. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
  738. for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
  739. health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
  740. diseases_2011 = data_2011[['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  741. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  742. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
  743. # 按 'ID' 列合并两个表
  744. health_status = pd.merge(health_status, diseases_2011, on='ID', how='left', suffixes=("_2013","_2011"))
  745. # 使用 fillna() 来更新字段
  746. for col in columns_to_diseases_new:
  747. health_status[col] = health_status[f'{col}_2013'].fillna(health_status[f'{col}_2011'])
  748. health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  749. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  750. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink", "Accident_Or_Injury", "Fell_In_Last2Years", "Menarche_Year", "Menarche_Age",
  751. "Menopause_Year", "Menopause_Age", "Prostate_Issue_Year", "Prostate_Issue_Age", "Wear_Glasses" ,
  752. "Average_Sleep_Hours", "Average_Nap_Minutes", "Vigorous_Activity_10Min", "Moderate_Effort_10Min"
  753. , "Walking_10Min", "Vigorous_Activity_Days", "Moderate_Effort_Days", "Walking_Days", "Vigorous_Activity_2Hours_PerDay"
  754. , "Moderate_Effort_2Hours_PerDay", "Walking_2Hours_PerDay", 'Vigorous_Activity_30Min_PerDay'
  755. , "Moderate_Effort_30Min_PerDay", "Walking_30Min_PerDay", "Vigorous_Activity_4Hours_PerDay"
  756. , "Moderate_Effort_4Hours_PerDay", "Walking_4Hours_PerDay", "Reason_For_Vigorous_Activity", "Reason_For_Moderate_Effort", "Reason_For_Walking"
  757. , "da056s1", "da056s2", "da056s3", "da056s4", "da056s5", "da056s6", "da056s7", "da056s8", "da056s9"
  758. , "da057_1_" , "da057_2_", "da057_3_", "da057_4_", "da057_5_", "da057_6_", "da057_7_", "da057_8_"
  759. , "Internet_Usage_LastMonth", "Internet_Tools_Desktop_computer","Internet_Tools_Laptop_computer", "Internet_Tools_Tablet_computer", "Internet_Tools_Cellphone", "Internet_Tools_Other"
  760. , "Internet_Purpose_Chat", "Internet_Purpose_news","Internet_Purpose_videos","Internet_Purpose_games","Internet_Purpose_Financial","Internet_Purpose_Others"
  761. , "Mobile_Payment", "Wechat_Usage", "Post_Moments", "Current_Smoking_Status"
  762. , "Smoking_Type_pipe","Smoking_Type_rolled","Smoking_Type_Filtered","Smoking_Type_Unfiltered","Smoking_Type_Cigar","Smoking_Type_Water"
  763. , "Daily_Cigarette_Count", "Drink_PastYear", "Drink_Monthly_Frequency", "Heart_attack_2_years", "Recurrent_Stroke"]]
  764. data_2013 = pd.merge(data_2013, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  765. #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分
  766. health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  767. health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  768. health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  769. health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  770. # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  771. health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
  772. health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
  773. health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
  774. health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
  775. health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
  776. #词语记忆
  777. health_status["dc006s1_score"] = health_status["dc006_1_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  778. health_status["dc006s2_score"] = health_status["dc006_1_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  779. health_status["dc006s3_score"] = health_status["dc006_1_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  780. health_status["dc006s4_score"] = health_status["dc006_1_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  781. health_status["dc006s5_score"] = health_status["dc006_1_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  782. health_status["dc006s6_score"] = health_status["dc006_1_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  783. health_status["dc006s7_score"] = health_status["dc006_1_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  784. health_status["dc006s8_score"] = health_status["dc006_1_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  785. health_status["dc006s9_score"] = health_status["dc006_1_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  786. health_status["dc006s10_score"] = health_status["dc006_1_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  787. # health_status["dc006s11_score"] = health_status["dc006_1_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  788. health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  789. health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  790. health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  791. health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  792. health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  793. health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  794. health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  795. health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  796. health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  797. health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  798. # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  799. #画图
  800. health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  801. data_2013["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  802. health_status["dc001s3_score"] + health_status["dc002_score"]+ \
  803. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  804. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  805. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  806. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  807. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  808. health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  809. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  810. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  811. health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
  812. health_status["draw_score"]
  813. #心理得分
  814. health_status["dc009_score"] = health_status["dc009"]-1
  815. health_status["dc010_score"] = health_status["dc010"]-1
  816. health_status["dc011_score"] = health_status["dc011"]-1
  817. health_status["dc012_score"] = health_status["dc012"]-1
  818. health_status["dc013_score"] = 4 - health_status["dc013"]
  819. health_status["dc014_score"] = health_status["dc014"]-1
  820. health_status["dc015_score"] = health_status["dc015"]-1
  821. health_status["dc016_score"] = 4 - health_status["dc016"]
  822. health_status["dc017_score"] = health_status["dc017"]-1
  823. health_status["dc018_score"] = health_status["dc018"]-1
  824. data_2013["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  825. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  826. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  827. #睡眠状态
  828. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  829. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  830. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  831. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  832. data_2013["sleep_state"] = health_status['dc015']
  833. #ADL
  834. health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  835. health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  836. health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  837. health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  838. health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  839. health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  840. data_2013["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
  841. health_status["db014_score"] + health_status["db015_score"]
  842. # 是否有管道煤气或天然气?
  843. houseing["Gas_Connection"] = houseing["i019"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  844. # 是否带供暖设施(不包括土暖气和可制暖的空调)?
  845. houseing["Heating_Facility"] = houseing["i020"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  846. # 供暖所用的主要能源是什么?
  847. # (1)Solar 太阳能
  848. # (2)Coal 煤炭、蜂窝煤
  849. # (3)Natural gas 管道天然气或煤气
  850. # (4)Liquefied Petroleum Gas 液化石油气
  851. # (5)Electric 电
  852. # (6)Crop residue/Wood buring 秸秆、柴火
  853. # (7)Other 其他
  854. houseing["Heating_Energy"] = houseing["i021"]
  855. # 做饭用的主要燃料是什么?
  856. # (1)Coal 煤炭、蜂窝煤
  857. # (2)Natural gas 管道天然气或煤气
  858. # (3)Marsh gas 沼气
  859. # (4)Liquefied Petroleum Gas 液化石油气
  860. # (5)Electric 电
  861. # (6)crop residue/Wood burning 秸秆、柴火
  862. # (7)other 其他
  863. houseing["Cooking_Fuel"] = houseing["i022"]
  864. houseing_select = houseing[['ID','householdID', 'communityID','Gas_Connection',
  865. 'Heating_Facility', 'Heating_Energy', 'Cooking_Fuel']]
  866. data_2013 = pd.merge(data_2013, houseing_select, on = ["ID", 'householdID', 'communityID'], how="left")
  867. data_2013["wave"] = year
  868. change_columns(data_2013)
  869. data_2013 = pd.concat([data_2011, data_2013], axis=0)
  870. print("2013 complete")
  871. # 2015年
  872. year = "2015"
  873. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  874. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
  875. blood, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Blood.dta")
  876. biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
  877. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  878. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
  879. weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
  880. houseing, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Housing_Characteristics.dta")
  881. #性别#年龄#婚姻状况
  882. # 1 married or partnered
  883. # 0 other marital status (separated, divorced, unmarried, or widowed)
  884. demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be001"]==7 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
  885. #教育
  886. # 0 below high school
  887. # 1 high school
  888. # 2 college or above
  889. #更新2015的教育
  890. demo["education"] = demo.apply(lambda x : x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) and not x["bd001_w2_4"]==12 else np.nan, axis=1)
  891. demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
  892. #合并2013年的教育
  893. eductaion_2013 = data_2013[data_2013["wave"]=="2013"][['ID',"education"]]
  894. # 按 'ID' 列合并两个表
  895. demo = pd.merge(demo, eductaion_2013, on='ID', how='left', suffixes=("_2015","_2013"))
  896. # 使用 fillna() 来更新字段
  897. demo['education'] = demo['education_2015'].fillna(demo['education_2013'])
  898. # 2015年的出生年
  899. demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba002"]==1 else x["ba002_1"] if x["ba002"]==2 else np.nan, axis=1)
  900. demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba002"]==1 else x["ba002_2"] if x["ba002"]==2 else np.nan, axis=1)
  901. #获取随访时间
  902. demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
  903. data_2015 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', 'education']]
  904. #居住地
  905. # 0 农村
  906. # 1 城市
  907. data_2015 = pd.merge(data_2015, psu[['communityID', 'province', 'city', 'urban_nbs']], on = "communityID", how="left")
  908. #身高#体重#收缩压#舒张压
  909. biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x)
  910. biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x)
  911. #腰围
  912. biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x)
  913. #血压测量后两次的平均
  914. biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x)
  915. biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x)
  916. biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x)
  917. biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x)
  918. biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
  919. biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
  920. #受试者可以在不用手臂支撑的情况下按其平时的节奏连续起立坐下五次吗
  921. # yes
  922. biomarkers["Sit_Stand_5x_yes"] = biomarkers["qh003"]
  923. # no
  924. biomarkers["Sit_Stand_5x_no"] = biomarkers["qh004"]
  925. # times
  926. biomarkers["Sit_Stand_5x_times"] = biomarkers["qh005"]
  927. # 步行速度时间
  928. biomarkers["Walking_Speed_Time_1"] = biomarkers["qg002"]
  929. biomarkers["Walking_Speed_Time_2"] = biomarkers["qg003"]
  930. #身高#体重#收缩压#舒张压
  931. biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002', 'ql002', 'waist', 'Systolic','Diastolic', "Sit_Stand_5x_yes", "Sit_Stand_5x_no",
  932. "Sit_Stand_5x_times", "Walking_Speed_Time_1", "Walking_Speed_Time_2"]]
  933. data_2015 = pd.merge(data_2015, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
  934. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  935. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  936. blood = blood[['ID', 'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]
  937. data_2015 = pd.merge(data_2015, blood, on = ["ID"], how="left")
  938. # 慢性病:
  939. # (1) Hypertension 高血压病
  940. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  941. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  942. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  943. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  944. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  945. # (除脂肪肝、肿瘤或癌外)
  946. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  947. # (8) Stroke 中风
  948. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  949. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  950. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  951. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  952. # (13) Arthritis or rheumatism 关节炎或风湿病
  953. # (14) Asthma 哮喘
  954. # 体力活动
  955. # 2 vigorous (vigorous activity more than once a week)
  956. # 1 moderate (moderate activity more than once a week)
  957. # 0 inactive (the rest)
  958. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
  959. 1 if x["da051_2_"]==1 else
  960. 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
  961. else np.nan ,axis=1)
  962. # 抽烟
  963. # 1 抽过烟
  964. # 0 没有抽过烟
  965. health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
  966. # 喝酒
  967. # 1 喝过酒
  968. # 0 没有喝过酒
  969. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
  970. 0 if x["da069"] == 1 else
  971. 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
  972. # 您是否经历过交通事故,或任何的重大意外伤害,并接受了治疗?
  973. # 1 是
  974. # 0 否
  975. health_status['Accident_Or_Injury']=health_status["da021"].apply(lambda x : 1 if x ==1 else
  976. 0 if x == 2 else np.nan)
  977. # 过去两年有没有摔倒?
  978. # 1 是
  979. # 0 否
  980. health_status['Fell_In_Last2Years']=health_status["da023"].apply(lambda x : 1 if x ==1 else
  981. 0 if x == 2 else np.nan)
  982. # 您什么时候开始来月经的?(year/age)
  983. health_status['Menarche_Year']=health_status["da026_1"]
  984. health_status['Menarche_Age']=health_status["da026_2"]
  985. # 您什么时候开始绝经的?
  986. health_status['Menopause_Year']=health_status["da028_1"]
  987. health_status['Menopause_Age']=health_status["da028_2"]
  988. # 第一次诊断出您有前列腺疾病是在什么时候?
  989. health_status['Prostate_Issue_Year']=health_status["da030_1"]
  990. health_status['Prostate_Issue_Age']=health_status["da030_2"]
  991. # 是否戴眼镜(包括矫正视力镜片)?
  992. # 1 是
  993. # 0 否
  994. # 2 失明
  995. # 3 偶尔
  996. health_status['Wear_Glasses']=health_status["da032"].apply(lambda x : 1 if x == 1 else 2 if x ==2 else 0 if x == 3 else 3 if x == 4 else np.nan)
  997. # 过去一个月内,您平均每天晚上真正睡着的时间大约是几小时?(可能短于您在床上躺着的时间)
  998. health_status['Average_Sleep_Hours']=health_status["da049"]
  999. # 过去一个月内,您通常午睡多长时间?分钟
  1000. health_status['Average_Nap_Minutes']=health_status["da050"]
  1001. # 您通常每周有没有至少持续做激烈活动十分钟?
  1002. health_status['Vigorous_Activity_10Min']=health_status["da051_1_"]
  1003. # 您通常每周有没有至少持续做中等强度的体力活动十分钟?
  1004. health_status['Moderate_Effort_10Min']=health_status["da051_2_"]
  1005. # 您通常每周有没有至少持续走路十分钟?
  1006. health_status['Walking_10Min']=health_status["da051_3_"]
  1007. # 您通常每周有多少天做[激烈活动]至少十分钟?
  1008. health_status['Vigorous_Activity_Days']=health_status["da052_1_"]
  1009. # 您通常每周有多少天做[中等强度的体力活动]至少十分钟?
  1010. health_status['Moderate_Effort_Days']=health_status["da052_2_"]
  1011. # 您通常每周有多少天做[走路]至少十分钟?
  1012. health_status['Walking_Days']=health_status["da052_3_"]
  1013. # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 2小时
  1014. health_status['Vigorous_Activity_2Hours_PerDay']=health_status["da053_1_"]
  1015. # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 2小时
  1016. health_status['Moderate_Effort_2Hours_PerDay']=health_status["da053_2_"]
  1017. # 在做[走路]的这些天里,您一天花多少时间做[走路] 2小时
  1018. health_status['Walking_2Hours_PerDay']=health_status["da053_3_"]
  1019. # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 30分钟
  1020. health_status['Vigorous_Activity_30Min_PerDay']=health_status["da054_1_"]
  1021. # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 30分钟
  1022. health_status['Moderate_Effort_30Min_PerDay']=health_status["da054_2_"]
  1023. # 在做[走路]的这些天里,您一天花多少时间做[走路] 30分钟
  1024. health_status['Walking_30Min_PerDay']=health_status["da054_3_"]
  1025. # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 4小时
  1026. health_status['Vigorous_Activity_4Hours_PerDay']=health_status["da055_1_"]
  1027. # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 4小时
  1028. health_status['Moderate_Effort_4Hours_PerDay']=health_status["da055_2_"]
  1029. # 在做[走路]的这些天里,您一天花多少时间做[走路] 4小时
  1030. health_status['Walking_4Hours_PerDay']=health_status["da055_3_"]
  1031. # 活动的原因
  1032. # 1 工作需要
  1033. # 2 娱乐
  1034. # 3 体育锻炼
  1035. # 4 其他
  1036. health_status["Reason_For_Vigorous_Activity"]= health_status["da051_1_1_"]
  1037. health_status["Reason_For_Moderate_Effort"]= health_status["da051_1_2_"]
  1038. health_status["Reason_For_Walking"]= health_status["da051_1_3_"]
  1039. # 过去一个月是否进行了下列社交活动?
  1040. # (1) 串门、跟朋友交往
  1041. # (2) 打麻将、下棋、打牌、去社区活动室
  1042. # (3) 无偿向与您不住在一起的亲人、朋友或者邻居提供帮助
  1043. # (4) 去公园或者其他场所跳舞、健身、练气功等
  1044. # (5) 参加社团组织活动
  1045. # (6) 志愿者活动或者慈善活动/无偿照顾与您不住在一起的病人或残疾人
  1046. # (7) 上学或者参加培训课程
  1047. # (8)其他
  1048. # (9) 以上均没有
  1049. health_status["da056s1"] = health_status.apply(lambda x: 1 if x["da056s1"]==1 else 0, axis=1)
  1050. health_status["da056s2"] = health_status.apply(lambda x: 2 if x["da056s2"]==2 else 0, axis=1)
  1051. health_status["da056s3"] = health_status.apply(lambda x: 3 if x["da056s3"]==3 else 0, axis=1)
  1052. health_status["da056s4"] = health_status.apply(lambda x: 4 if x["da056s4"]==4 else 0, axis=1)
  1053. health_status["da056s5"] = health_status.apply(lambda x: 5 if x["da056s5"]==5 else 0, axis=1)
  1054. health_status["da056s6"] = health_status.apply(lambda x: 6 if x["da056s6"]==6 or x["da056s7"]==7 else 0, axis=1)
  1055. health_status["da056s7"] = health_status.apply(lambda x: 7 if x["da056s8"]==8 else 0, axis=1)
  1056. health_status["da056s8"] = health_status.apply(lambda x: 8 if x["da056s9"]==9 or x["da056s10"]==10 or x["da056s11"]==11 else 0, axis=1)
  1057. health_status["da056s9"] = health_status.apply(lambda x: 7 if x["da056s12"]==12 else 0, axis=1)
  1058. # 过去一个月的活动频率
  1059. # (1) Almost daily 差不多每天
  1060. # (2) Almost every week 差不多每周
  1061. # (3) Not regularly 不经常
  1062. health_status["da057_6_"] = health_status.apply(lambda x: 1 if x["da057_6_"]==1 or x["da057_7_"]==1 else 2 if x["da057_6_"]==2 or x["da057_7_"]==2 else 3 if x["da057_6_"]==3 or x["da057_7_"]==3 else np.nan, axis=1)
  1063. health_status["da057_7_"] = health_status["da057_8_"]
  1064. health_status["da057_8_"] = health_status.apply(lambda x: 1 if x["da057_9_"]==1 or x["da057_10_"]==1 or x["da057_11_"]==1
  1065. else 2 if x["da057_9_"]==2 or x["da057_10_"]==2 or x["da057_11_"]==2
  1066. else 3 if x["da057_9_"]==3 or x["da057_10_"]==3 or x["da057_11_"]==3
  1067. else np.nan, axis=1)
  1068. # 过去一个月,您是否上网?
  1069. health_status["Internet_Usage_LastMonth"] = health_status["da056s10"].apply(lambda x : 1 if x==10 else 0)
  1070. # 使用以下哪些工具上网?
  1071. # 1. Desktop computer 台式电脑
  1072. # 2. Laptop computer 笔记本电脑
  1073. # 3. Tablet computer 平板电脑(如 IPAD)
  1074. # 4. Cellphone 手机
  1075. # 5. Other devices 其他设备
  1076. health_status["Internet_Tools_Desktop_computer"] = health_status["da056_w3s1"]
  1077. health_status["Internet_Tools_Laptop_computer"] = health_status["da056_w3s2"]
  1078. health_status["Internet_Tools_Tablet_computer"] = health_status["da056_w3s3"]
  1079. health_status["Internet_Tools_Cellphone"] = health_status["da056_w3s4"]
  1080. health_status["Internet_Tools_Other"] = health_status["da056_w3s5"]
  1081. # 上网一般做什么?
  1082. health_status[["Internet_Purpose_Chat"]] = np.nan
  1083. health_status[["Internet_Purpose_news"]] = np.nan
  1084. health_status[["Internet_Purpose_videos"]] = np.nan
  1085. health_status[["Internet_Purpose_games"]] = np.nan
  1086. health_status[["Internet_Purpose_Financial"]] = np.nan
  1087. health_status[["Internet_Purpose_Others"]] = np.nan
  1088. # 是否会用手机支付
  1089. health_status[["Mobile_Payment"]] = np.nan
  1090. # 是否使用微信?
  1091. health_status[["Wechat_Usage"]] = np.nan
  1092. # 发不发微信朋友圈?
  1093. health_status[["Post_Moments"]] = np.nan
  1094. # 现在还在吸烟还是戒烟了?
  1095. # 1 仍然抽烟
  1096. # 2 戒烟
  1097. health_status['Current_Smoking_Status']=health_status["da061"].apply(lambda x : 1 if x == 1 else 2 if x ==2 else np.nan)
  1098. # 更新一下2013年没有回答的用户
  1099. health_status["da061_w3"] = health_status["da061_w3"].apply(lambda x : 1 if x ==1 else 2 if x ==2 else np.nan)
  1100. data_2013 = pd.merge(data_2013, health_status[['ID',"da061_w3"]], on='ID', how='left')
  1101. data_2013['Current_Smoking_Status'] = data_2013['Current_Smoking_Status'].fillna(data_2013['da061_w3'])
  1102. data_2013 = data_2013.drop('da061_w3', axis=1)
  1103. # 按 'ID' 列合并两个表
  1104. # 吸烟时,一般抽什么烟?
  1105. # (1) Smoking a pipe 用烟管吸烟(烟袋、旱烟)
  1106. # (2) Smoking self-rolled cigarettes 自己卷烟抽
  1107. # (3) Filtered cigarette带滤咀香烟
  1108. # (4) Unfiltered cigarette不带滤咀香烟
  1109. # (5) Cigar雪茄
  1110. # (6) Water cigarettes 水烟
  1111. health_status['Smoking_Type_pipe']=health_status["da060s1"]
  1112. health_status['Smoking_Type_rolled']=health_status["da060s2"]
  1113. health_status['Smoking_Type_Filtered']=health_status["da060s3"]
  1114. health_status['Smoking_Type_Unfiltered']=health_status["da060s4"]
  1115. health_status['Smoking_Type_Cigar']=health_status["da060s5"]
  1116. health_status['Smoking_Type_Water']=health_status["da060s6"]
  1117. # 现在/戒烟前平均一天抽多少支香烟?
  1118. health_status['Daily_Cigarette_Count']=health_status["da063"]
  1119. # 在过去的一年, 喝酒吗
  1120. # (1) Drink more than once a month. 喝酒,每月超过一次
  1121. # (2) Drink but less than once a month 喝酒,但每月少于一次
  1122. # (3) None of these 什么都不喝
  1123. health_status['Drink_PastYear']=health_status["da067"]
  1124. # 过去一年内 平均一个月喝几次酒
  1125. # (1)Once a month 每月一次
  1126. # (2)2-3 times a month 每月2-3次
  1127. # (3)Once a week 每周一次
  1128. # (4)2-3 times a week 每周2-3次
  1129. # (5)4-6 times a week 每周4-6次
  1130. # (6)Once a day 每天一次
  1131. # (7)Twice a day 一天两次
  1132. # (8)More than twice a day 一天超过两次
  1133. health_status['Drink_Monthly_Frequency']=health_status.apply(lambda x : 8 if x["da072"] ==8 or x["da074"] ==8 or x["da076"] ==8 else
  1134. 7 if x["da072"] ==7 or x["da074"] ==7 or x["da076"] ==7 else
  1135. 6 if x["da072"] ==6 or x["da074"] ==6 or x["da076"] ==6 else
  1136. 5 if x["da072"] ==5 or x["da074"] ==5 or x["da076"] ==5 else
  1137. 4 if x["da072"] ==4 or x["da074"] ==4 or x["da076"] ==4 else
  1138. 3 if x["da072"] ==3 or x["da074"] ==3 or x["da076"] ==3 else
  1139. 2 if x["da072"] ==2 or x["da074"] ==2 or x["da076"] ==2 else
  1140. 1 if x["da072"] ==1 or x["da074"] ==1 or x["da076"] ==1 else np.nan, axis=1)
  1141. # 自上次访问以来的两年内,您是否发作过心脏病?
  1142. # 1 是
  1143. # 0 否
  1144. health_status["Heart_attack_2_years"] = health_status.apply(lambda x : 1 if x["da007_w2_5"] ==1 else
  1145. 0 if x["da007_w2_5"] == 2 else np.nan, axis=1)
  1146. # 自上次访问以来,是否有医生诊断您中风复发?
  1147. # 1 是
  1148. # 0 否
  1149. health_status['Recurrent_Stroke']=health_status.apply(lambda x : 1 if x["da019_w2_1"] ==1 else
  1150. 0 if x["da019_w2_1"] == 2 else np.nan, axis=1)
  1151. # 合并2013年的慢性病
  1152. columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  1153. ,'da007_12_','da007_13_','da007_14_']
  1154. columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  1155. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  1156. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
  1157. for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
  1158. health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
  1159. diseases_2013 = data_2013[data_2013["wave"]=="2013"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  1160. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  1161. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
  1162. # 按 'ID' 列合并两个表
  1163. health_status = pd.merge(health_status, diseases_2013, on='ID', how='left', suffixes=("_2015","_2013"))
  1164. # 使用 fillna() 来更新字段
  1165. for col in columns_to_diseases_new:
  1166. health_status[col] = health_status[f'{col}_2015'].fillna(health_status[f'{col}_2013'])
  1167. health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  1168. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  1169. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink", "Accident_Or_Injury", "Fell_In_Last2Years", "Menarche_Year", "Menarche_Age",
  1170. "Menopause_Year", "Menopause_Age", "Prostate_Issue_Year", "Prostate_Issue_Age", "Wear_Glasses" ,
  1171. "Average_Sleep_Hours", "Average_Nap_Minutes", "Vigorous_Activity_10Min", "Moderate_Effort_10Min"
  1172. , "Walking_10Min", "Vigorous_Activity_Days", "Moderate_Effort_Days", "Walking_Days", "Vigorous_Activity_2Hours_PerDay"
  1173. , "Moderate_Effort_2Hours_PerDay", "Walking_2Hours_PerDay", 'Vigorous_Activity_30Min_PerDay'
  1174. , "Moderate_Effort_30Min_PerDay", "Walking_30Min_PerDay", "Vigorous_Activity_4Hours_PerDay"
  1175. , "Moderate_Effort_4Hours_PerDay", "Walking_4Hours_PerDay", "Reason_For_Vigorous_Activity", "Reason_For_Moderate_Effort", "Reason_For_Walking"
  1176. , "da056s1", "da056s2", "da056s3", "da056s4", "da056s5", "da056s6", "da056s7", "da056s8", "da056s9"
  1177. , "da057_1_" , "da057_2_", "da057_3_", "da057_4_", "da057_5_", "da057_6_", "da057_7_", "da057_8_"
  1178. , "Internet_Usage_LastMonth", "Internet_Tools_Desktop_computer","Internet_Tools_Laptop_computer", "Internet_Tools_Tablet_computer", "Internet_Tools_Cellphone", "Internet_Tools_Other"
  1179. , "Internet_Purpose_Chat", "Internet_Purpose_news", "Internet_Purpose_videos", "Internet_Purpose_games", "Internet_Purpose_Financial", "Internet_Purpose_Others"
  1180. , "Mobile_Payment", "Wechat_Usage", "Post_Moments", "Current_Smoking_Status"
  1181. , "Smoking_Type_pipe","Smoking_Type_rolled","Smoking_Type_Filtered","Smoking_Type_Unfiltered","Smoking_Type_Cigar","Smoking_Type_Water"
  1182. , "Daily_Cigarette_Count", "Drink_PastYear", "Drink_Monthly_Frequency", "Heart_attack_2_years", "Recurrent_Stroke"]]
  1183. data_2015 = pd.merge(data_2015, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  1184. #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分
  1185. health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  1186. health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  1187. health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  1188. health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  1189. # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  1190. health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
  1191. health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
  1192. health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
  1193. health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
  1194. health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
  1195. #词语记忆
  1196. health_status["dc006s1_score"] = health_status["dc006s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  1197. health_status["dc006s2_score"] = health_status["dc006s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  1198. health_status["dc006s3_score"] = health_status["dc006s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  1199. health_status["dc006s4_score"] = health_status["dc006s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  1200. health_status["dc006s5_score"] = health_status["dc006s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  1201. health_status["dc006s6_score"] = health_status["dc006s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  1202. health_status["dc006s7_score"] = health_status["dc006s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  1203. health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  1204. health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  1205. health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  1206. # health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  1207. health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  1208. health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  1209. health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  1210. health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  1211. health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  1212. health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  1213. health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  1214. health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  1215. health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  1216. health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  1217. # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  1218. #画图
  1219. health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1220. data_2015["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  1221. health_status["dc001s3_score"] + health_status["dc002_score"]+ \
  1222. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  1223. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  1224. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  1225. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  1226. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  1227. health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  1228. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  1229. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  1230. health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
  1231. health_status["draw_score"]
  1232. #心理得分
  1233. health_status["dc009_score"] = health_status["dc009"]-1
  1234. health_status["dc010_score"] = health_status["dc010"]-1
  1235. health_status["dc011_score"] = health_status["dc011"]-1
  1236. health_status["dc012_score"] = health_status["dc012"]-1
  1237. health_status["dc013_score"] = 4 - health_status["dc013"]
  1238. health_status["dc014_score"] = health_status["dc014"]-1
  1239. health_status["dc015_score"] = health_status["dc015"]-1
  1240. health_status["dc016_score"] = 4 - health_status["dc016"]
  1241. health_status["dc017_score"] = health_status["dc017"]-1
  1242. health_status["dc018_score"] = health_status["dc018"]-1
  1243. data_2015["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  1244. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  1245. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  1246. #睡眠状态
  1247. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  1248. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  1249. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  1250. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  1251. data_2015["sleep_state"] = health_status['dc015']
  1252. #ADL
  1253. health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1254. health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1255. health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1256. health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1257. health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1258. health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1259. data_2015["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
  1260. health_status["db014_score"] + health_status["db015_score"]
  1261. # 是否有管道煤气或天然气?
  1262. houseing["Gas_Connection"] = houseing["i019"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  1263. # 是否带供暖设施(不包括土暖气和可制暖的空调)?
  1264. houseing["Heating_Facility"] = houseing["i020"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  1265. # 供暖所用的主要能源是什么?
  1266. # (1)Solar 太阳能
  1267. # (2)Coal 煤炭、蜂窝煤
  1268. # (3)Natural gas 管道天然气或煤气
  1269. # (4)Liquefied Petroleum Gas 液化石油气
  1270. # (5)Electric 电
  1271. # (6)Crop residue/Wood buring 秸秆、柴火
  1272. # (7)Other 其他
  1273. houseing["Heating_Energy"] = houseing["i021"]
  1274. # 做饭用的主要燃料是什么?
  1275. # (1)Coal 煤炭、蜂窝煤
  1276. # (2)Natural gas 管道天然气或煤气
  1277. # (3)Marsh gas 沼气
  1278. # (4)Liquefied Petroleum Gas 液化石油气
  1279. # (5)Electric 电
  1280. # (6)crop residue/Wood burning 秸秆、柴火
  1281. # (7)other 其他
  1282. houseing["Cooking_Fuel"] = houseing["i022"]
  1283. houseing_select = houseing[['ID','householdID', 'communityID','Gas_Connection',
  1284. 'Heating_Facility', 'Heating_Energy', 'Cooking_Fuel']]
  1285. data_2015 = pd.merge(data_2015, houseing_select, on = ["ID", 'householdID', 'communityID'], how="left")
  1286. data_2015["wave"] = year
  1287. change_columns(data_2015)
  1288. data_2015 = pd.concat([data_2013, data_2015], axis=0)
  1289. print("2015 complete")
  1290. # 2018年
  1291. year = "2018"
  1292. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  1293. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
  1294. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  1295. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
  1296. cognition, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Cognition.dta")
  1297. weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
  1298. houseing, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Housing.dta")
  1299. #性别#年龄#婚姻状况
  1300. # 1 married or partnered
  1301. # 0 other marital status (separated, divorced, unmarried, or widowed)
  1302. demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be002"]==1 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
  1303. #教育
  1304. # 0 below high school
  1305. # 1 high school
  1306. # 2 college or above
  1307. #更新2015的教育
  1308. demo["education"] = demo.apply(lambda x : x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) else np.nan, axis=1)
  1309. demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
  1310. # 出生年
  1311. demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba005_w4"]==1 else x["ba002_1"] if x["ba005_w4"]==2 else np.nan, axis=1)
  1312. demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba005_w4"]==1 else x["ba002_2"] if x["ba005_w4"]==2 else np.nan, axis=1)
  1313. #获取随访时间
  1314. demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
  1315. data_2018 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', 'education']]
  1316. #居住地
  1317. # 0 农村
  1318. # 1 城市
  1319. data_2018 = pd.merge(data_2018, psu[['communityID', 'province', 'city', 'urban_nbs']], on = "communityID", how="left")
  1320. #身高#体重#收缩压#舒张压
  1321. data_2018[['qi002', 'ql002', 'waist','qa011' ,'qa012']]=np.nan
  1322. #受试者可以在不用手臂支撑的情况下按其平时的节奏连续起立坐下五次吗
  1323. data_2018[["Sit_Stand_5x_yes","Sit_Stand_5x_no", "Sit_Stand_5x_times", "Walking_Speed_Time_1", "Walking_Speed_Time_2"]] = np.nan
  1324. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  1325. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  1326. data_2018[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
  1327. # 慢性病:
  1328. # (1) Hypertension 高血压病
  1329. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  1330. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  1331. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  1332. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  1333. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  1334. # (除脂肪肝、肿瘤或癌外)
  1335. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  1336. # (8) Stroke 中风
  1337. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  1338. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  1339. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  1340. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  1341. # (13) Arthritis or rheumatism 关节炎或风湿病
  1342. # (14) Asthma 哮喘
  1343. # 体力活动
  1344. # 2 vigorous (vigorous activity more than once a week)
  1345. # 1 moderate (moderate activity more than once a week)
  1346. # 0 inactive (the rest)
  1347. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
  1348. 1 if x["da051_2_"]==1 else
  1349. 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
  1350. else np.nan ,axis=1)
  1351. # 抽烟
  1352. # 1 抽过烟
  1353. # 0 没有抽过烟
  1354. health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
  1355. # 喝酒
  1356. # 1 喝过酒
  1357. # 0 没有喝过酒
  1358. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
  1359. 0 if x["da069"] == 1 else
  1360. 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
  1361. # 您是否经历过交通事故,或任何的重大意外伤害,并接受了治疗?
  1362. # 1 是
  1363. # 0 否
  1364. health_status['Accident_Or_Injury']=health_status.apply(lambda x : 1 if (not pd.isna(x["da021"]) and x["da021"]==1) or (pd.isna(x["da021"]) and not pd.isna(x["da022"]) )else
  1365. 0 if (not pd.isna(x["da021"]) and x["da021"]==2) or (pd.isna(x["da021"]) and pd.isna(x["da022"]) ) else np.nan, axis=1)
  1366. # 过去两年有没有摔倒?
  1367. # 1 是
  1368. # 0 否
  1369. health_status['Fell_In_Last2Years']=health_status.apply(lambda x : 1 if x["da023"] ==1 or x["da023_w4"]==1 else
  1370. 0 if x["da023"] ==2 or x["da023_w4"]==2 else np.nan, axis=1)
  1371. # 您什么时候开始来月经的?(year/age)
  1372. health_status['Menarche_Year']=health_status["da026_1"]
  1373. health_status['Menarche_Age']=health_status["da026_2"]
  1374. # 您什么时候开始绝经的?
  1375. health_status['Menopause_Year']=health_status["da028_1"]
  1376. health_status['Menopause_Age']=health_status["da028_2"]
  1377. # 第一次诊断出您有前列腺疾病是在什么时候?
  1378. health_status['Prostate_Issue_Year']=health_status["da030_1"]
  1379. health_status['Prostate_Issue_Age']=health_status["da030_2"]
  1380. # 是否戴眼镜(包括矫正视力镜片)?
  1381. # 1 是
  1382. # 0 否
  1383. # 2 失明
  1384. # 3 偶尔
  1385. health_status['Wear_Glasses']=health_status["da032"].apply(lambda x : 1 if x == 1 else 2 if x ==2 else 0 if x == 3 else 3 if x == 4 else np.nan)
  1386. # 过去一个月内,您平均每天晚上真正睡着的时间大约是几小时?(可能短于您在床上躺着的时间)
  1387. health_status['Average_Sleep_Hours']=health_status["da049"]
  1388. # 过去一个月内,您通常午睡多长时间?分钟
  1389. health_status['Average_Nap_Minutes']=health_status["da050"]
  1390. # 您通常每周有没有至少持续做激烈活动十分钟?
  1391. health_status['Vigorous_Activity_10Min']=health_status["da051_1_"]
  1392. # 您通常每周有没有至少持续做中等强度的体力活动十分钟?
  1393. health_status['Moderate_Effort_10Min']=health_status["da051_2_"]
  1394. # 您通常每周有没有至少持续走路十分钟?
  1395. health_status['Walking_10Min']=health_status["da051_3_"]
  1396. # 您通常每周有多少天做[激烈活动]至少十分钟?
  1397. health_status['Vigorous_Activity_Days']=health_status["da052_1_"]
  1398. # 您通常每周有多少天做[中等强度的体力活动]至少十分钟?
  1399. health_status['Moderate_Effort_Days']=health_status["da052_2_"]
  1400. # 您通常每周有多少天做[走路]至少十分钟?
  1401. health_status['Walking_Days']=health_status["da052_3_"]
  1402. # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 2小时
  1403. health_status['Vigorous_Activity_2Hours_PerDay']=health_status["da053_1_"]
  1404. # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 2小时
  1405. health_status['Moderate_Effort_2Hours_PerDay']=health_status["da053_2_"]
  1406. # 在做[走路]的这些天里,您一天花多少时间做[走路] 2小时
  1407. health_status['Walking_2Hours_PerDay']=health_status["da053_3_"]
  1408. # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 30分钟
  1409. health_status['Vigorous_Activity_30Min_PerDay']=health_status["da054_1_"]
  1410. # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 30分钟
  1411. health_status['Moderate_Effort_30Min_PerDay']=health_status["da054_2_"]
  1412. # 在做[走路]的这些天里,您一天花多少时间做[走路] 30分钟
  1413. health_status['Walking_30Min_PerDay']=health_status["da054_3_"]
  1414. # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 4小时
  1415. health_status['Vigorous_Activity_4Hours_PerDay']=health_status["da055_1_"]
  1416. # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 4小时
  1417. health_status['Moderate_Effort_4Hours_PerDay']=health_status["da055_2_"]
  1418. # 在做[走路]的这些天里,您一天花多少时间做[走路] 4小时
  1419. health_status['Walking_4Hours_PerDay']=health_status["da055_3_"]
  1420. # 活动的原因
  1421. # 1 工作需要
  1422. # 2 娱乐
  1423. # 3 体育锻炼
  1424. # 4 其他
  1425. health_status["Reason_For_Vigorous_Activity"]= health_status["da051_1_1_"]
  1426. health_status["Reason_For_Moderate_Effort"]= health_status["da051_1_2_"]
  1427. health_status["Reason_For_Walking"]= health_status["da051_1_3_"]
  1428. # 过去一个月是否进行了下列社交活动?
  1429. # (1) 串门、跟朋友交往
  1430. # (2) 打麻将、下棋、打牌、去社区活动室
  1431. # (3) 无偿向与您不住在一起的亲人、朋友或者邻居提供帮助
  1432. # (4) 去公园或者其他场所跳舞、健身、练气功等
  1433. # (5) 参加社团组织活动
  1434. # (6) 志愿者活动或者慈善活动/无偿照顾与您不住在一起的病人或残疾人
  1435. # (7) 上学或者参加培训课程
  1436. # (8)其他
  1437. # (9) 以上均没有
  1438. health_status["da056_s1"] = health_status.apply(lambda x: 1 if x["da056_s1"]==1 else 0, axis=1)
  1439. health_status["da056_s2"] = health_status.apply(lambda x: 2 if x["da056_s2"]==2 else 0, axis=1)
  1440. health_status["da056_s3"] = health_status.apply(lambda x: 3 if x["da056_s3"]==3 else 0, axis=1)
  1441. health_status["da056_s4"] = health_status.apply(lambda x: 4 if x["da056_s4"]==4 else 0, axis=1)
  1442. health_status["da056_s5"] = health_status.apply(lambda x: 5 if x["da056_s5"]==5 else 0, axis=1)
  1443. health_status["da056_s6"] = health_status.apply(lambda x: 6 if x["da056_s6"]==6 or x["da056_s7"]==7 else 0, axis=1)
  1444. health_status["da056_s7"] = health_status.apply(lambda x: 7 if x["da056_s8"]==8 else 0, axis=1)
  1445. health_status["da056_s8"] = health_status.apply(lambda x: 8 if x["da056_s9"]==9 or x["da056_s10"]==10 or x["da056_s11"]==11 else 0, axis=1)
  1446. health_status["da056_s9"] = health_status.apply(lambda x: 7 if x["da056_s12"]==12 else 0, axis=1)
  1447. # 过去一个月的活动频率
  1448. # (1) Almost daily 差不多每天
  1449. # (2) Almost every week 差不多每周
  1450. # (3) Not regularly 不经常
  1451. health_status["da057_6_"] = health_status.apply(lambda x: 1 if x["da057_6_"]==1 or x["da057_7_"]==1 else 2 if x["da057_6_"]==2 or x["da057_7_"]==2 else 3 if x["da057_6_"]==3 or x["da057_7_"]==3 else np.nan, axis=1)
  1452. health_status["da057_7_"] = health_status["da057_8_"]
  1453. health_status["da057_8_"] = health_status.apply(lambda x: 1 if x["da057_9_"]==1 or x["da057_10_"]==1 or x["da057_11_"]==1
  1454. else 2 if x["da057_9_"]==2 or x["da057_10_"]==2 or x["da057_11_"]==2
  1455. else 3 if x["da057_9_"]==3 or x["da057_10_"]==3 or x["da057_11_"]==3
  1456. else np.nan, axis=1)
  1457. # 过去一个月,您是否上网?
  1458. health_status["Internet_Usage_LastMonth"] = health_status["da056_s10"].apply(lambda x : 1 if x==10 else 0)
  1459. # 使用以下哪些工具上网?
  1460. # 1. Desktop computer 台式电脑
  1461. # 2. Laptop computer 笔记本电脑
  1462. # 3. Tablet computer 平板电脑(如 IPAD)
  1463. # 4. Cellphone 手机
  1464. # 5. Other devices 其他设备
  1465. health_status["Internet_Tools_Desktop_computer"] = health_status["da056_w3_s1"]
  1466. health_status["Internet_Tools_Laptop_computer"] = health_status["da056_w3_s2"]
  1467. health_status["Internet_Tools_Tablet_computer"] = health_status["da056_w3_s3"]
  1468. health_status["Internet_Tools_Cellphone"] = health_status["da056_w3_s4"]
  1469. health_status["Internet_Tools_Other"] = health_status["da056_w3_s5"]
  1470. # 上网一般做什么?
  1471. health_status["Internet_Purpose_Chat"] = health_status["da056_w4_1_s1"]
  1472. health_status["Internet_Purpose_news"] = health_status["da056_w4_1_s2"]
  1473. health_status["Internet_Purpose_videos"] = health_status["da056_w4_1_s3"]
  1474. health_status["Internet_Purpose_games"] = health_status["da056_w4_1_s4"]
  1475. health_status["Internet_Purpose_Financial"] = health_status["da056_w4_1_s5"]
  1476. health_status["Internet_Purpose_Others"] = health_status["da056_w4_1_s6"]
  1477. # 是否会用手机支付
  1478. # 1 是
  1479. # 0 否
  1480. health_status["Mobile_Payment"] = health_status["da056_w4_2"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1481. # 是否使用微信?
  1482. health_status["Wechat_Usage"] = health_status["da056_w4_3"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1483. # 发不发微信朋友圈?
  1484. health_status["Post_Moments"] = health_status["da056_w4_4"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1485. # 现在还在吸烟还是戒烟了?
  1486. # 1 仍然抽烟
  1487. # 2 戒烟
  1488. health_status['Current_Smoking_Status']=health_status.apply(lambda x : 1 if x["da061"] == 1 or x["da061_w4"] == 1 else 2 if x["da061"] == 2 or x["da061_w4"] == 2 else np.nan, axis=1)
  1489. # 吸烟时,一般抽什么烟?
  1490. # (1) Smoking a pipe 用烟管吸烟(烟袋、旱烟)
  1491. # (2) Smoking self-rolled cigarettes 自己卷烟抽
  1492. # (3) Filtered cigarette带滤咀香烟
  1493. # (4) Unfiltered cigarette不带滤咀香烟
  1494. # (5) Cigar雪茄
  1495. # (6) Water cigarettes 水烟
  1496. health_status.loc[health_status['da060'] == 1, 'Smoking_Type_pipe'] = 1
  1497. health_status.loc[health_status['da060'] == 2, 'Smoking_Type_rolled'] = 2
  1498. health_status.loc[health_status['da060'] == 3, 'Smoking_Type_Filtered'] = 3
  1499. health_status.loc[health_status['da060'] == 4, 'Smoking_Type_Unfiltered'] = 4
  1500. health_status.loc[health_status['da060'] == 5, 'Smoking_Type_Cigar'] = 5
  1501. health_status.loc[health_status['da060'] == 6, 'Smoking_Type_Water'] = 6
  1502. # 现在/戒烟前平均一天抽多少支香烟?
  1503. health_status['Daily_Cigarette_Count']=health_status["da063"]
  1504. # 在过去的一年, 喝酒吗
  1505. # (1) Drink more than once a month. 喝酒,每月超过一次
  1506. # (2) Drink but less than once a month 喝酒,但每月少于一次
  1507. # (3) None of these 什么都不喝
  1508. health_status['Drink_PastYear']=health_status["da067"]
  1509. # 过去一年内 平均一个月喝几次酒
  1510. # (1)Once a month 每月一次
  1511. # (2)2-3 times a month 每月2-3次
  1512. # (3)Once a week 每周一次
  1513. # (4)2-3 times a week 每周2-3次
  1514. # (5)4-6 times a week 每周4-6次
  1515. # (6)Once a day 每天一次
  1516. # (7)Twice a day 一天两次
  1517. # (8)More than twice a day 一天超过两次
  1518. health_status['Drink_Monthly_Frequency']=health_status.apply(lambda x : 8 if x["da072"] ==8 or x["da074"] ==8 or x["da076"] ==8 else
  1519. 7 if x["da072"] ==7 or x["da074"] ==7 or x["da076"] ==7 else
  1520. 6 if x["da072"] ==6 or x["da074"] ==6 or x["da076"] ==6 else
  1521. 5 if x["da072"] ==5 or x["da074"] ==5 or x["da076"] ==5 else
  1522. 4 if x["da072"] ==4 or x["da074"] ==4 or x["da076"] ==4 else
  1523. 3 if x["da072"] ==3 or x["da074"] ==3 or x["da076"] ==3 else
  1524. 2 if x["da072"] ==2 or x["da074"] ==2 or x["da076"] ==2 else
  1525. 1 if x["da072"] ==1 or x["da074"] ==1 or x["da076"] ==1 else np.nan, axis=1)
  1526. # 自上次访问以来的两年内,您是否发作过心脏病?
  1527. # 1 是
  1528. # 0 否
  1529. health_status["Heart_attack_2_years"] = health_status.apply(lambda x : 1 if x["da007_w2_5"] ==1 else
  1530. 0 if x["da007_w2_5"] == 2 else np.nan, axis=1)
  1531. # 自上次访问以来,是否有医生诊断您中风复发?
  1532. # 1 是
  1533. # 0 否
  1534. health_status['Recurrent_Stroke']=health_status.apply(lambda x : 1 if x["da019_w2_1"] ==1 else
  1535. 0 if x["da019_w2_1"] == 2 else np.nan, axis=1)
  1536. columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  1537. ,'da007_12_','da007_13_','da007_14_']
  1538. columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  1539. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  1540. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
  1541. for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
  1542. health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
  1543. diseases_2015 = data_2015[data_2015["wave"]=="2015"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  1544. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  1545. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
  1546. # 按 'ID' 列合并两个表
  1547. health_status = pd.merge(health_status, diseases_2015, on='ID', how='left', suffixes=("_2018","_2015"))
  1548. # 使用 fillna() 来更新字段
  1549. for col in columns_to_diseases_new:
  1550. health_status[col] = health_status[f'{col}_2018'].fillna(health_status[f'{col}_2015'])
  1551. health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  1552. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  1553. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink", "Accident_Or_Injury", "Fell_In_Last2Years", "Menarche_Year", "Menarche_Age",
  1554. "Menopause_Year", "Menopause_Age", "Prostate_Issue_Year", "Prostate_Issue_Age", "Wear_Glasses" ,
  1555. "Average_Sleep_Hours", "Average_Nap_Minutes", "Vigorous_Activity_10Min", "Moderate_Effort_10Min"
  1556. , "Walking_10Min", "Vigorous_Activity_Days", "Moderate_Effort_Days", "Walking_Days", "Vigorous_Activity_2Hours_PerDay"
  1557. , "Moderate_Effort_2Hours_PerDay", "Walking_2Hours_PerDay", 'Vigorous_Activity_30Min_PerDay'
  1558. , "Moderate_Effort_30Min_PerDay", "Walking_30Min_PerDay", "Vigorous_Activity_4Hours_PerDay"
  1559. , "Moderate_Effort_4Hours_PerDay", "Walking_4Hours_PerDay", "Reason_For_Vigorous_Activity", "Reason_For_Moderate_Effort", "Reason_For_Walking"
  1560. , "da056_s1", "da056_s2", "da056_s3", "da056_s4", "da056_s5", "da056_s6", "da056_s7", "da056_s8", "da056_s9"
  1561. , "da057_1_" , "da057_2_", "da057_3_", "da057_4_", "da057_5_", "da057_6_", "da057_7_", "da057_8_"
  1562. , "Internet_Usage_LastMonth", "Internet_Tools_Desktop_computer","Internet_Tools_Laptop_computer", "Internet_Tools_Tablet_computer", "Internet_Tools_Cellphone", "Internet_Tools_Other"
  1563. , "Internet_Purpose_Chat", "Internet_Purpose_news", "Internet_Purpose_videos", "Internet_Purpose_games", "Internet_Purpose_Financial", "Internet_Purpose_Others"
  1564. , "Mobile_Payment", "Wechat_Usage", "Post_Moments", "Current_Smoking_Status"
  1565. , "Smoking_Type_pipe","Smoking_Type_rolled","Smoking_Type_Filtered","Smoking_Type_Unfiltered","Smoking_Type_Cigar","Smoking_Type_Water"
  1566. , "Daily_Cigarette_Count", "Drink_PastYear", "Drink_Monthly_Frequency","Heart_attack_2_years", "Recurrent_Stroke"]]
  1567. data_2018 = pd.merge(data_2018, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  1568. #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分
  1569. cognition["dc001s1_score"] = cognition["dc001_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  1570. cognition["dc001s2_score"] = cognition["dc006_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  1571. cognition["dc001s3_score"] = cognition["dc003_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  1572. cognition["dc002_score"] = cognition["dc005_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  1573. # cognition["dc003_score"] = cognition["dc002_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  1574. cognition["dc019_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_1"]==97 else 1 if pd.isna(x["dc014_w4_1"]) and x["dc014_w4_1_1"]==93 else 0 if pd.isna(x["dc014_w4_1"]) and (not x["dc014_w4_1_1"]==93) else np.nan, axis=1)
  1575. cognition["dc020_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_2"]==97 else 1 if pd.isna(x["dc014_w4_2"]) and x["dc014_w4_2_1"]==86 else 0 if pd.isna(x["dc014_w4_2"]) and (not x["dc014_w4_2_1"]==86) else np.nan, axis=1)
  1576. cognition["dc021_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_3"]==97 else 1 if pd.isna(x["dc014_w4_3"]) and x["dc014_w4_3_1"]==79 else 0 if pd.isna(x["dc014_w4_3"]) and (not x["dc014_w4_3_1"]==79) else np.nan, axis=1)
  1577. cognition["dc022_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_4"]==97 else 1 if pd.isna(x["dc014_w4_4"]) and x["dc014_w4_4_1"]==72 else 0 if pd.isna(x["dc014_w4_4"]) and (not x["dc014_w4_4_1"]==72) else np.nan, axis=1)
  1578. cognition["dc023_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_5"]==97 else 1 if pd.isna(x["dc014_w4_5"]) and x["dc014_w4_5_1"]==65 else 0 if pd.isna(x["dc014_w4_5"]) and (not x["dc014_w4_5_1"]==65) else np.nan, axis=1)
  1579. #词语记忆
  1580. cognition["dc006s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s1"]==1 else 0, axis=1)
  1581. cognition["dc006s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s2"]==2 else 0, axis=1)
  1582. cognition["dc006s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s3"]==3 else 0, axis=1)
  1583. cognition["dc006s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s4"]==4 else 0, axis=1)
  1584. cognition["dc006s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s5"]==5 else 0, axis=1)
  1585. cognition["dc006s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s6"]==6 else 0, axis=1)
  1586. cognition["dc006s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s7"]==7 else 0, axis=1)
  1587. cognition["dc006s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s8"]==8 else 0, axis=1)
  1588. cognition["dc006s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s9"]==9 else 0, axis=1)
  1589. cognition["dc006s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s10"]==10 else 0, axis=1)
  1590. # cognition["dc006s11_score"] = cognition["dc028_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  1591. cognition["dc027s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s1"]==1 else 0, axis=1)
  1592. cognition["dc027s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s2"]==2 else 0, axis=1)
  1593. cognition["dc027s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s3"]==3 else 0, axis=1)
  1594. cognition["dc027s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s4"]==4 else 0, axis=1)
  1595. cognition["dc027s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s5"]==5 else 0, axis=1)
  1596. cognition["dc027s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s6"]==6 else 0, axis=1)
  1597. cognition["dc027s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s7"]==7 else 0, axis=1)
  1598. cognition["dc027s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s8"]==8 else 0, axis=1)
  1599. cognition["dc027s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s9"]==9 else 0, axis=1)
  1600. cognition["dc027s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s10"]==10 else 0, axis=1)
  1601. # cognition["dc027s11_score"] = cognition["dc047_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  1602. #画图
  1603. cognition["draw_score"] = cognition["dc024_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  1604. data_2018["Cognition_score"] = cognition["dc001s1_score"] + cognition["dc001s2_score"] + \
  1605. cognition["dc001s3_score"] + cognition["dc002_score"]+ \
  1606. cognition["dc019_score"]+ cognition["dc020_score"] + cognition["dc021_score"]+ \
  1607. cognition["dc022_score"]+ cognition["dc023_score"] + cognition["dc006s1_score"] + \
  1608. cognition["dc006s2_score"] + cognition["dc006s3_score"] + cognition["dc006s4_score"] + \
  1609. cognition["dc006s5_score"] + cognition["dc006s6_score"] + cognition["dc006s7_score"] + \
  1610. cognition["dc006s8_score"] + cognition["dc006s9_score"] + cognition["dc006s10_score"] + \
  1611. cognition["dc027s1_score"]+ cognition["dc027s2_score"]+ \
  1612. cognition["dc027s3_score"]+ cognition["dc027s4_score"]+ cognition["dc027s5_score"]+ \
  1613. cognition["dc027s6_score"]+ cognition["dc027s7_score"]+ cognition["dc027s8_score"]+ \
  1614. cognition["dc027s9_score"]+cognition["dc027s10_score"]+\
  1615. cognition["draw_score"]
  1616. #心理得分
  1617. cognition["dc009_score"] = cognition["dc009"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1618. cognition["dc010_score"] = cognition["dc010"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1619. cognition["dc011_score"] = cognition["dc011"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1620. cognition["dc012_score"] = cognition["dc012"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1621. cognition["dc013_score"] = cognition["dc013"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
  1622. cognition["dc014_score"] = cognition["dc014"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1623. cognition["dc015_score"] = cognition["dc015"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1624. cognition["dc016_score"] = cognition["dc016"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
  1625. cognition["dc017_score"] = cognition["dc017"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1626. cognition["dc018_score"] = cognition["dc018"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1627. data_2018["psychiatric_score"] = cognition["dc009_score"] + cognition["dc010_score"] + cognition["dc011_score"] + \
  1628. cognition["dc012_score"] + cognition["dc013_score"] + cognition["dc014_score"] + cognition["dc015_score"] + \
  1629. cognition["dc016_score"] + cognition["dc017_score"] + cognition["dc018_score"]
  1630. #睡眠状态
  1631. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  1632. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  1633. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  1634. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  1635. data_2018["sleep_state"] = cognition['dc015'].apply(lambda x : np.nan if x > 4 else x)
  1636. #ADL
  1637. health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1638. health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1639. health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1640. health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1641. health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1642. health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1643. data_2018["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
  1644. health_status["db014_score"] + health_status["db015_score"]
  1645. # 是否有管道煤气或天然气?
  1646. houseing["Gas_Connection"] = houseing["i019"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  1647. # 是否带供暖设施(不包括土暖气和可制暖的空调)?
  1648. houseing["Heating_Facility"] = houseing["i020"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  1649. # 供暖所用的主要能源是什么?
  1650. # (1)Solar 太阳能
  1651. # (2)Coal 煤炭、蜂窝煤
  1652. # (3)Natural gas 管道天然气或煤气
  1653. # (4)Liquefied Petroleum Gas 液化石油气
  1654. # (5)Electric 电
  1655. # (6)Crop residue/Wood buring 秸秆、柴火
  1656. # (7)Other 其他
  1657. houseing["Heating_Energy"] = houseing["i021_w4"].apply(lambda x : 7 if x==8 else x )
  1658. # 做饭用的主要燃料是什么?
  1659. # (1)Coal 煤炭、蜂窝煤
  1660. # (2)Natural gas 管道天然气或煤气
  1661. # (3)Marsh gas 沼气
  1662. # (4)Liquefied Petroleum Gas 液化石油气
  1663. # (5)Electric 电
  1664. # (6)crop residue/Wood burning 秸秆、柴火
  1665. # (7)other 其他
  1666. houseing["Cooking_Fuel"] = houseing["i022_w4"].apply(lambda x : np.nan if x==8 else x )
  1667. houseing_select = houseing[['householdID', 'communityID','Gas_Connection',
  1668. 'Heating_Facility', 'Heating_Energy', 'Cooking_Fuel']]
  1669. data_2018 = pd.merge(data_2018, houseing_select, on = ['householdID', 'communityID'], how="left")
  1670. data_2018["wave"] = year
  1671. change_columns(data_2018)
  1672. data_2018 = pd.concat([data_2015, data_2018], axis=0)
  1673. print("2018 complete")
  1674. # 2020年
  1675. year = "2020"
  1676. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  1677. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
  1678. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  1679. weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
  1680. houseing, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Household_Income.dta")
  1681. #性别#年龄#婚姻状况
  1682. # 1 married or partnered
  1683. # 0 other marital status (separated, divorced, unmarried, or widowed)
  1684. demo["marital_status"] = demo.apply(lambda x : 1 if x["ba011"]==1 or x["ba011"]==2 or x["ba012"]==1 else 0 if x["ba011"] in [3,4,5,6] else np.nan, axis=1)
  1685. #教育
  1686. # 0 below high school
  1687. # 1 high school
  1688. # 2 college or above
  1689. demo["education"] = demo.apply(lambda x : x["ba010"] if not pd.isna(x["ba010"]) else np.nan, axis=1)
  1690. demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
  1691. #合并2018年的教育
  1692. eductaion_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"education"]]
  1693. # 按 'ID' 列合并两个表
  1694. demo = pd.merge(demo, eductaion_2018, on='ID', how='left', suffixes=("_2020","_2018"))
  1695. # 使用 fillna() 来更新字段
  1696. demo['education'] = demo['education_2020'].fillna(demo['education_2018'])
  1697. # 出生年
  1698. demo["birth_year"] = demo.apply(lambda x : x["ba003_1"] if pd.isna(x["ba003_1"]) else np.nan, axis=1)
  1699. demo["birth_month"] = demo.apply(lambda x : x["ba003_2"] if pd.isna(x["ba003_2"]) else np.nan, axis=1)
  1700. #合并2018年的出生年
  1701. birth_year_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"birth_year", "birth_month"]]
  1702. # 按 'ID' 列合并两个表
  1703. demo = pd.merge(demo, birth_year_2018, on='ID', how='left', suffixes=("_2020","_2018"))
  1704. # 使用 fillna() 来更新字段
  1705. demo['birth_year'] = demo['birth_year_2020'].fillna(demo['birth_year_2018'])
  1706. demo['birth_month'] = demo['birth_month_2020'].fillna(demo['birth_month_2018'])
  1707. #获取随访时间
  1708. demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
  1709. demo["ba003"] = 1
  1710. data_2020 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', 'education']]
  1711. #居住地
  1712. # 0 农村
  1713. # 1 城市
  1714. data_2020 = pd.merge(data_2020, psu[['communityID', 'province', 'city', 'urban_nbs']], on = "communityID", how="left")
  1715. #身高#体重#收缩压#舒张压
  1716. data_2020[['qi002', 'ql002', 'waist', 'Systolic','Diastolic']]=np.nan
  1717. #受试者可以在不用手臂支撑的情况下按其平时的节奏连续起立坐下五次吗
  1718. data_2020[["Sit_Stand_5x_yes","Sit_Stand_5x_no", "Sit_Stand_5x_times", "Walking_Speed_Time_1", "Walking_Speed_Time_2"]] = np.nan
  1719. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  1720. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  1721. data_2020[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
  1722. # 慢性病:
  1723. # (1) Hypertension 高血压病
  1724. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  1725. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  1726. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  1727. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  1728. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  1729. # (除脂肪肝、肿瘤或癌外)
  1730. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  1731. # (8) Stroke 中风
  1732. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  1733. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  1734. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  1735. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  1736. # (13) Arthritis or rheumatism 关节炎或风湿病
  1737. # (14) Asthma 哮喘
  1738. # 2020年把帕金森和记忆病症分开,需要和以前对齐
  1739. # 体力活动
  1740. # 2 vigorous (vigorous activity more than once a week)
  1741. # 1 moderate (moderate activity more than once a week)
  1742. # 0 inactive (the rest)
  1743. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da032_1_"]==1 else
  1744. 1 if x["da032_2_"]==1 else
  1745. 0 if x["da032_3_"] == 1 or (x["da032_1_"]==2 and x["da032_2_"]==2 and x["da032_3_"] == 2)
  1746. else np.nan ,axis=1)
  1747. # 抽烟
  1748. # 1 抽过烟
  1749. # 0 没有抽过烟
  1750. health_status["Smoke"] = health_status["da046"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
  1751. # 喝酒
  1752. # 1 喝过酒
  1753. # 0 没有喝过酒
  1754. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da051"] ==1 or x["da051"] ==2 else
  1755. 0 if x["da051"] == 3 else np.nan, axis=1)
  1756. # 您是否经历过交通事故,或任何的重大意外伤害,并接受了治疗?
  1757. # 1 是
  1758. # 0 否
  1759. health_status['Accident_Or_Injury']=health_status.apply(lambda x : 1 if x["da019"] ==1 or x["da020"]==1 else
  1760. 0 if x["da019"] ==2 or x["da020"]==2 else np.nan, axis=1)
  1761. # 过去两年有没有摔倒?
  1762. # 1 是
  1763. # 0 否
  1764. health_status['Fell_In_Last2Years']=health_status.apply(lambda x : 1 if x["da022"] ==1 or x["da023"]==1 else
  1765. 0 if x["da022"] ==2 or x["da023"]==2 else np.nan, axis=1)
  1766. # 您什么时候开始来月经的?(year/age)
  1767. health_status[['Menarche_Year']]=np.nan
  1768. health_status[['Menarche_Age']]=np.nan
  1769. # 您什么时候开始绝经的?
  1770. health_status[['Menopause_Year']]=np.nan
  1771. health_status[['Menopause_Age']]=np.nan
  1772. # 第一次诊断出您有前列腺疾病是在什么时候?
  1773. health_status[['Prostate_Issue_Year']]=np.nan
  1774. health_status[['Prostate_Issue_Age']]=np.nan
  1775. # 是否戴眼镜(包括矫正视力镜片)?
  1776. # 1 是
  1777. # 0 否
  1778. # 2 失明
  1779. # 3 偶尔
  1780. health_status[['Wear_Glasses']]=np.nan
  1781. # 过去一个月内,您平均每天晚上真正睡着的时间大约是几小时?(可能短于您在床上躺着的时间)
  1782. health_status['Average_Sleep_Hours']=health_status["da030"]
  1783. # 过去一个月内,您通常午睡多长时间?分钟
  1784. health_status['Average_Nap_Minutes']=health_status["da031"]
  1785. # 您通常每周有没有至少持续做激烈活动十分钟?
  1786. health_status['Vigorous_Activity_10Min']=health_status["da032_1_"]
  1787. # 您通常每周有没有至少持续做中等强度的体力活动十分钟?
  1788. health_status['Moderate_Effort_10Min']=health_status["da032_2_"]
  1789. # 您通常每周有没有至少持续走路十分钟?
  1790. health_status['Walking_10Min']=health_status["da032_3_"]
  1791. # 您通常每周有多少天做[激烈活动]至少十分钟?
  1792. health_status['Vigorous_Activity_Days']=health_status["da033_1_"]
  1793. # 您通常每周有多少天做[中等强度的体力活动]至少十分钟?
  1794. health_status['Moderate_Effort_Days']=health_status["da033_2_"]
  1795. # 您通常每周有多少天做[走路]至少十分钟?
  1796. health_status['Walking_Days']=health_status["da033_3_"]
  1797. # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 2小时
  1798. health_status['Vigorous_Activity_2Hours_PerDay']=health_status["da034_1_"]
  1799. # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 2小时
  1800. health_status['Moderate_Effort_2Hours_PerDay']=health_status["da034_2_"]
  1801. # 在做[走路]的这些天里,您一天花多少时间做[走路] 2小时
  1802. health_status['Walking_2Hours_PerDay']=health_status["da034_3_"]
  1803. # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 30分钟
  1804. health_status['Vigorous_Activity_30Min_PerDay']=health_status["da035_1_"]
  1805. # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 30分钟
  1806. health_status['Moderate_Effort_30Min_PerDay']=health_status["da035_2_"]
  1807. # 在做[走路]的这些天里,您一天花多少时间做[走路] 30分钟
  1808. health_status['Walking_30Min_PerDay']=health_status["da035_3_"]
  1809. # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 4小时
  1810. health_status['Vigorous_Activity_4Hours_PerDay']=health_status["da036_1_"]
  1811. # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 4小时
  1812. health_status['Moderate_Effort_4Hours_PerDay']=health_status["da036_2_"]
  1813. # 在做[走路]的这些天里,您一天花多少时间做[走路] 4小时
  1814. health_status['Walking_4Hours_PerDay']=health_status["da036_3_"]
  1815. # 活动的原因
  1816. # 1 工作需要
  1817. # 2 娱乐
  1818. # 3 体育锻炼
  1819. # 4 其他
  1820. health_status["Reason_For_Vigorous_Activity"]= health_status["da037_1_"]
  1821. health_status["Reason_For_Moderate_Effort"]= health_status["da037_2_"]
  1822. health_status["Reason_For_Walking"]= health_status["da037_3_"]
  1823. # 过去一个月是否进行了下列社交活动?
  1824. # (1) 串门、跟朋友交往
  1825. # (2) 打麻将、下棋、打牌、去社区活动室
  1826. # (3) 无偿向与您不住在一起的亲人、朋友或者邻居提供帮助
  1827. # (4) 去公园或者其他场所跳舞、健身、练气功等
  1828. # (5) 参加社团组织活动
  1829. # (6) 志愿者活动或者慈善活动/无偿照顾与您不住在一起的病人或残疾人
  1830. # (7) 上学或者参加培训课程
  1831. # (8)其他
  1832. # (9) 以上均没有
  1833. # 过去一个月的活动频率
  1834. # (1) Almost daily 差不多每天
  1835. # (2) Almost every week 差不多每周
  1836. # (3) Not regularly 不经常
  1837. # 过去一个月,您是否上网?
  1838. # 1 是
  1839. # 0 否
  1840. health_status["Internet_Usage_LastMonth"] = health_status["da040"].apply(lambda x : 1 if x ==1 else 0)
  1841. # 使用以下哪些工具上网?
  1842. # 1. Desktop computer 台式电脑
  1843. # 2. Laptop computer 笔记本电脑
  1844. # 3. Tablet computer 平板电脑(如 IPAD)
  1845. # 4. Cellphone 手机
  1846. # 5. Other devices 其他设备
  1847. health_status["Internet_Tools_Desktop_computer"] = health_status["da041_s1"]
  1848. health_status["Internet_Tools_Laptop_computer"] = health_status["da041_s2"]
  1849. health_status["Internet_Tools_Tablet_computer"] = health_status["da041_s3"]
  1850. health_status["Internet_Tools_Cellphone"] = health_status["da041_s4"]
  1851. health_status["Internet_Tools_Other"] = health_status["da041_s5"]
  1852. # 上网一般做什么?
  1853. health_status["Internet_Purpose_Chat"] = health_status["da042_s1"]
  1854. health_status["Internet_Purpose_news"] = health_status["da042_s2"]
  1855. health_status["Internet_Purpose_videos"] = health_status["da042_s3"]
  1856. health_status["Internet_Purpose_games"] = health_status["da042_s4"]
  1857. health_status["Internet_Purpose_Financial"] = health_status["da042_s5"]
  1858. health_status["Internet_Purpose_Others"] = health_status["da042_s6"]
  1859. # 是否会用手机支付
  1860. # 1 是
  1861. # 0 否
  1862. health_status["Mobile_Payment"] = health_status["da043"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1863. # 是否使用微信?
  1864. health_status["Wechat_Usage"] = health_status["da044"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1865. # 发不发微信朋友圈?
  1866. health_status["Post_Moments"] = health_status["da045"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1867. # 现在还在吸烟还是戒烟了?
  1868. # 1 仍然抽烟
  1869. # 2 戒烟
  1870. health_status['Current_Smoking_Status']=health_status.apply(lambda x : 1 if x["da047"] == 1 else 2 if x["da047"] == 2 else np.nan, axis=1)
  1871. # 吸烟时,一般抽什么烟?
  1872. # (1) Smoking a pipe 用烟管吸烟(烟袋、旱烟)
  1873. # (2) Smoking self-rolled cigarettes 自己卷烟抽
  1874. # (3) Filtered cigarette带滤咀香烟
  1875. # (4) Unfiltered cigarette不带滤咀香烟
  1876. # (5) Cigar雪茄
  1877. # (6) Water cigarettes 水烟
  1878. health_status.loc[health_status['da048'] == 1, 'Smoking_Type_pipe'] = 1
  1879. health_status.loc[health_status['da048'] == 2, 'Smoking_Type_rolled'] = 2
  1880. health_status.loc[health_status['da048'] == 3, 'Smoking_Type_Filtered'] = 3
  1881. health_status.loc[health_status['da048'] == 4, 'Smoking_Type_Unfiltered'] = 4
  1882. health_status.loc[health_status['da048'] == 5, 'Smoking_Type_Cigar'] = 5
  1883. health_status.loc[health_status['da048'] == 6, 'Smoking_Type_Water'] = 6
  1884. # 现在/戒烟前平均一天抽多少支香烟?
  1885. health_status['Daily_Cigarette_Count']=health_status.apply(lambda x : x["da050_1"] if not pd.isna(x["da050_1"]) else x["da050_2"] if not pd.isna(x["da050_2"]) else np.nan, axis=1)
  1886. # 在过去的一年, 喝酒吗
  1887. # (1) Drink more than once a month. 喝酒,每月超过一次
  1888. # (2) Drink but less than once a month 喝酒,但每月少于一次
  1889. # (3) None of these 什么都不喝
  1890. health_status['Drink_PastYear']=health_status["da051"]
  1891. # 过去一年内 平均一个月喝几次酒
  1892. # (1)Once a month 每月一次
  1893. # (2)2-3 times a month 每月2-3次
  1894. # (3)Once a week 每周一次
  1895. # (4)2-3 times a week 每周2-3次
  1896. # (5)4-6 times a week 每周4-6次
  1897. # (6)Once a day 每天一次
  1898. # (7)Twice a day 一天两次
  1899. # (8)More than twice a day 一天超过两次
  1900. health_status['Drink_Monthly_Frequency']=health_status["da052"]
  1901. health_status['da003_12_'] = health_status.apply(process_row, axis=1)
  1902. columns_to_diseases_old = ['da003_1_', 'da003_2_','da003_3_','da003_4_','da003_5_','da003_6_','da003_7_','da003_8_','da003_9_','da003_10_','da003_11_'
  1903. ,'da003_12_','da003_14_','da003_15_']
  1904. columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  1905. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  1906. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
  1907. for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
  1908. health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
  1909. diseases_2018 = data_2018[data_2018["wave"]=="2018"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  1910. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  1911. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
  1912. # 按 'ID' 列合并两个表
  1913. health_status = pd.merge(health_status, diseases_2018, on='ID', how='left', suffixes=("_2020","_2018"))
  1914. # 使用 fillna() 来更新字段
  1915. for col in columns_to_diseases_new:
  1916. health_status[col] = health_status[f'{col}_2020'].fillna(health_status[f'{col}_2018'])
  1917. health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  1918. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  1919. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink", "Accident_Or_Injury", "Fell_In_Last2Years", "Menarche_Year", "Menarche_Age",
  1920. "Menopause_Year", "Menopause_Age", "Prostate_Issue_Year", "Prostate_Issue_Age", "Wear_Glasses",
  1921. "Average_Sleep_Hours", "Average_Nap_Minutes", "Vigorous_Activity_10Min", "Moderate_Effort_10Min"
  1922. , "Walking_10Min", "Vigorous_Activity_Days", "Moderate_Effort_Days", "Walking_Days", "Vigorous_Activity_2Hours_PerDay"
  1923. , "Moderate_Effort_2Hours_PerDay", "Walking_2Hours_PerDay", 'Vigorous_Activity_30Min_PerDay'
  1924. , "Moderate_Effort_30Min_PerDay", "Walking_30Min_PerDay", "Vigorous_Activity_4Hours_PerDay"
  1925. , "Moderate_Effort_4Hours_PerDay", "Walking_4Hours_PerDay", "Reason_For_Vigorous_Activity", "Reason_For_Moderate_Effort", "Reason_For_Walking"
  1926. , "da038_s1", "da038_s2", "da038_s3", "da038_s4", "da038_s5", "da038_s6", "da038_s7", "da038_s8", "da038_s9"
  1927. , "da039_1_" , "da039_2_", "da039_3_", "da039_4_", "da039_5_", "da039_6_", "da039_7_", "da039_8_"
  1928. , "Internet_Usage_LastMonth", "Internet_Tools_Desktop_computer","Internet_Tools_Laptop_computer", "Internet_Tools_Tablet_computer", "Internet_Tools_Cellphone", "Internet_Tools_Other"
  1929. , "Internet_Purpose_Chat", "Internet_Purpose_news", "Internet_Purpose_videos", "Internet_Purpose_games", "Internet_Purpose_Financial", "Internet_Purpose_Others"
  1930. , "Mobile_Payment", "Wechat_Usage", "Post_Moments", "Current_Smoking_Status"
  1931. , "Smoking_Type_pipe","Smoking_Type_rolled","Smoking_Type_Filtered","Smoking_Type_Unfiltered","Smoking_Type_Cigar","Smoking_Type_Water"
  1932. , "Daily_Cigarette_Count", "Drink_PastYear", "Drink_Monthly_Frequency",]]
  1933. data_2020 = pd.merge(data_2020, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  1934. # 自上次访问以来的两年内,您是否发作过心脏病?
  1935. # 1 是
  1936. # 0 否
  1937. data_2020[['Heart_attack_2_years']]=np.nan
  1938. # 自上次访问以来,是否有医生诊断您中风复发?
  1939. # 1 是
  1940. # 0 否
  1941. data_2020[['Recurrent_Stroke']]=np.nan
  1942. #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分
  1943. health_status["dc001s1_score"] = health_status["dc001"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1944. health_status["dc001s2_score"] = health_status["dc005"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1945. health_status["dc001s3_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1946. health_status["dc002_score"] = health_status["dc004"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1947. health_status["dc003_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1948. health_status["dc019_score"] = health_status.apply(lambda x : 0 if x["dc007_1"]==997 else 1 if x["dc007_1"] ==1 and x["dc007_1_1"]==93 else 0 if x["dc007_1"] ==1 and (not x["dc007_1_1"]==93) else np.nan, axis=1)
  1949. health_status["dc020_score"] = health_status.apply(lambda x : 0 if x["dc007_2"]==997 else 1 if x["dc007_2"] ==1 and x["dc007_2_1"]==86 else 0 if x["dc007_2"] ==1 and (not x["dc007_2_1"]==86) else np.nan, axis=1)
  1950. health_status["dc021_score"] = health_status.apply(lambda x : 0 if x["dc007_3"]==997 else 1 if x["dc007_3"] ==1 and x["dc007_3_1"]==79 else 0 if x["dc007_3"] ==1 and (not x["dc007_3_1"]==79) else np.nan, axis=1)
  1951. health_status["dc022_score"] = health_status.apply(lambda x : 0 if x["dc007_4"]==997 else 1 if x["dc007_4"] ==1 and x["dc007_4_1"]==72 else 0 if x["dc007_4"] ==1 and (not x["dc007_4_1"]==72) else np.nan, axis=1)
  1952. health_status["dc023_score"] = health_status.apply(lambda x : 0 if x["dc007_5"]==997 else 1 if x["dc007_5"] ==1 and x["dc007_5_1"]==65 else 0 if x["dc007_5"] ==1 and (not x["dc007_5_1"]==65) else np.nan, axis=1)
  1953. #词语记忆
  1954. health_status["dc006s1_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s1"]==1 else 0, axis=1)
  1955. health_status["dc006s2_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s2"]==2 else 0, axis=1)
  1956. health_status["dc006s3_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s3"]==3 else 0, axis=1)
  1957. health_status["dc006s4_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s4"]==4 else 0, axis=1)
  1958. health_status["dc006s5_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s5"]==5 else 0, axis=1)
  1959. health_status["dc006s6_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s6"]==6 else 0, axis=1)
  1960. health_status["dc006s7_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s7"]==7 else 0, axis=1)
  1961. health_status["dc006s8_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s8"]==8 else 0, axis=1)
  1962. health_status["dc006s9_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s9"]==9 else 0, axis=1)
  1963. health_status["dc006s10_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s10"]==10 else 0, axis=1)
  1964. health_status["dc027s1_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s1"]==1 else 0, axis=1)
  1965. health_status["dc027s2_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s2"]==2 else 0, axis=1)
  1966. health_status["dc027s3_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s3"]==3 else 0, axis=1)
  1967. health_status["dc027s4_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s4"]==4 else 0, axis=1)
  1968. health_status["dc027s5_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s5"]==5 else 0, axis=1)
  1969. health_status["dc027s6_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s6"]==6 else 0, axis=1)
  1970. health_status["dc027s7_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s7"]==7 else 0, axis=1)
  1971. health_status["dc027s8_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s8"]==8 else 0, axis=1)
  1972. health_status["dc027s9_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s9"]==9 else 0, axis=1)
  1973. health_status["dc027s10_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s10"]==10 else 0, axis=1)
  1974. #画图
  1975. health_status["draw_score"] = health_status["dc009"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1976. data_2020["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  1977. health_status["dc001s3_score"] + health_status["dc002_score"]+ \
  1978. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  1979. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  1980. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  1981. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  1982. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  1983. health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  1984. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  1985. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  1986. health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
  1987. health_status["draw_score"]
  1988. #心理得分
  1989. health_status["dc009_score"] = health_status["dc016"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1990. health_status["dc010_score"] = health_status["dc017"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1991. health_status["dc011_score"] = health_status["dc018"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1992. health_status["dc012_score"] = health_status["dc019"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1993. health_status["dc013_score"] = health_status["dc020"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
  1994. health_status["dc014_score"] = health_status["dc021"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1995. health_status["dc015_score"] = health_status["dc022"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1996. health_status["dc016_score"] = health_status["dc023"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
  1997. health_status["dc017_score"] = health_status["dc024"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1998. health_status["dc018_score"] = health_status["dc025"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1999. data_2020["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  2000. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  2001. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  2002. #睡眠状态
  2003. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  2004. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  2005. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  2006. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  2007. data_2020["sleep_state"] = health_status['dc022'].apply(lambda x : np.nan if x >900 else x)
  2008. #ADL
  2009. health_status["db010_score"] = health_status["db001"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  2010. health_status["db011_score"] = health_status["db003"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  2011. health_status["db012_score"] = health_status["db005"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  2012. health_status["db013_score"] = health_status["db007"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  2013. health_status["db014_score"] = health_status["db009"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  2014. health_status["db015_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  2015. data_2020["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
  2016. health_status["db014_score"] + health_status["db015_score"]
  2017. # 是否有管道煤气或天然气?
  2018. houseing["Gas_Connection"] = houseing["i018"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  2019. # 是否带供暖设施(不包括土暖气和可制暖的空调)?
  2020. houseing["Heating_Facility"] = houseing["i019"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  2021. # 供暖所用的主要能源是什么?
  2022. # (1)Solar 太阳能
  2023. # (2)Coal 煤炭、蜂窝煤
  2024. # (3)Natural gas 管道天然气或煤气
  2025. # (4)Liquefied Petroleum Gas 液化石油气
  2026. # (5)Electric 电
  2027. # (6)Crop residue/Wood buring 秸秆、柴火
  2028. # (7)Other 其他
  2029. houseing["Heating_Energy"] = houseing["i020"].apply(lambda x : np.nan if x==8 else x )
  2030. # 做饭用的主要燃料是什么?
  2031. # (1)Coal 煤炭、蜂窝煤
  2032. # (2)Natural gas 管道天然气或煤气
  2033. # (3)Marsh gas 沼气
  2034. # (4)Liquefied Petroleum Gas 液化石油气
  2035. # (5)Electric 电
  2036. # (6)crop residue/Wood burning 秸秆、柴火
  2037. # (7)other 其他
  2038. houseing["Cooking_Fuel"] = houseing["i021"].apply(lambda x : np.nan if x==9 else 7 if x == 8 else x)
  2039. houseing_select = houseing[['householdID', 'communityID','Gas_Connection',
  2040. 'Heating_Facility', 'Heating_Energy', 'Cooking_Fuel']]
  2041. data_2020 = pd.merge(data_2020, houseing_select, on = ['householdID', 'communityID'], how="left")
  2042. data_2020["wave"] = year
  2043. change_columns(data_2020)
  2044. data_2020 = pd.concat([data_2018, data_2020], axis=0)
  2045. #修改地区名称
  2046. #省份、城市名称和污染物数据格式对齐
  2047. #海东地区->海东市
  2048. data_2020['city'] = data_2020['city'].replace('海东地区', '海东市')
  2049. #北京 -> 北京市
  2050. data_2020['city'] = data_2020['city'].replace('北京', '北京市')
  2051. data_2020['province'] = data_2020['province'].replace('北京', '北京市')
  2052. #哈尔滨 -> 哈尔滨市
  2053. data_2020['city'] = data_2020['city'].replace('哈尔滨', '哈尔滨市')
  2054. #天津 -> 天津市
  2055. data_2020['city'] = data_2020['city'].replace('天津', '天津市')
  2056. data_2020['province'] = data_2020['province'].replace('天津', '天津市')
  2057. #广西省 -> 广西壮族自治区
  2058. data_2020['province'] = data_2020['province'].replace('广西省', '广西壮族自治区')
  2059. #巢湖市 -> 合肥市
  2060. data_2020['city'] = data_2020['city'].replace('巢湖市', '合肥市')
  2061. #襄樊市->襄阳市
  2062. data_2020['city'] = data_2020['city'].replace('襄樊市', '襄阳市')
  2063. data_2020.to_csv("/root/r_base/CHARLS/result_all_new.csv", index=False)
  2064. print(123)