Browse Source

处理CLHLS数据库

JazzZhao 1 month ago
parent
commit
2e76179279

+ 0 - 60
CHARLS_P/different.py

@@ -1,60 +0,0 @@
-import pandas as pd
-from glob import glob
-
-year = "2011"
-path = "/root/r_base/CHARLS/CHARLS"
-
-if __name__ == "__main__":
-    year = "2011"
-    files = glob(path+"2011/*.dta")
-    var_2011 = []
-    for file_name in files:
-        data = pd.read_stata(file_name)
-        var_2011 += data.columns.to_list()
-    year = "2013"
-    files = glob(path+"2013/*.dta")
-    var_2013 = []
-    for file_name in files:
-        data = pd.read_stata(file_name)
-        var_2013 += data.columns.to_list()
-    #获取2013新增变量
-    var_2011 = set(var_2011)
-    result_2013 = [elem for elem in var_2013 if elem not in var_2011]
-    with open("2013.csv", "w") as f2013:
-        f2013.write('\n'.join(result_2013) + '\n')
-
-    year = "2015"
-    files = glob(path+"2015/*.dta")
-    var_2015 = []
-    for file_name in files:
-        data = pd.read_stata(file_name)
-        var_2015 += data.columns.to_list()
-    #获取2015新增变量
-    var_2013 = set(var_2013)
-    result_2015 = [elem for elem in var_2015 if elem not in var_2013]
-    with open("2015.csv", "w") as f2015:
-        f2015.write('\n'.join(result_2015) + '\n')
-
-    year = "2018"
-    files = glob(path+"2018/*.dta")
-    var_2018 = []
-    for file_name in files:
-        data = pd.read_stata(file_name)
-        var_2018 += data.columns.to_list()
-    #获取2018新增变量
-    var_2015 = set(var_2015)
-    result_2018 = [elem for elem in var_2018 if elem not in var_2015]
-    with open("2018.csv", "w") as f2018:
-        f2018.write('\n'.join(result_2018) + '\n')
-
-    year = "2020"
-    files = glob(path+"2020/*.dta")
-    var_2020 = []
-    for file_name in files:
-        data = pd.read_stata(file_name)
-        var_2020 += data.columns.to_list()
-    #获取2020新增变量
-    var_2018 = set(var_2018)
-    result_2020 = [elem for elem in var_2020 if elem not in var_2018]
-    with open("2020.csv", "w") as f2020:
-        f2020.write('\n'.join(result_2020) + '\n')

+ 21 - 0
CLHLS_P/CLHLS_NDVI.py

@@ -0,0 +1,21 @@
+import pandas as pd
+
+if __name__ == "__main__":
+    years = [2011, 2014, 2018]
+
+    #读取CLHLS数据
+    CLHLS_data = pd.read_csv("CLHLS/clhls_1998_2018_p_n_m.csv")
+    CLHLS_data.to_csv("CLHLS/clhls_1998_2018_p_n_m_nd.csv",index=False)
+    CLHLS_data = pd.read_csv("CLHLS/clhls_1998_2018_p_n_m_nd.csv")
+    
+    #读取NDVI数据
+    ndvi_data = pd.read_excel(f"NDVI/【立方数据学社】省份等级的逐年NDVI.xlsx")
+
+    for year in years:
+        #两个表合并
+        table_merge = pd.merge(CLHLS_data, ndvi_data, left_on="province", right_on="PR", how='left')
+        #更新CLHLS表
+        CLHLS_data['last_year_ndvi'+"_"+str(year)[-2:]] = table_merge[str(year-1)].values
+        CLHLS_data['before_last_ndvi'+"_"+str(year)[-2:]] = table_merge[str(year-2)].values
+        print(year)
+    CLHLS_data.to_csv("CLHLS/clhls_1998_2018_p_n_m_nd.csv",index=False)

+ 16 - 0
CLHLS_P/CLHLS_NL.py

@@ -0,0 +1,16 @@
+import pandas as pd
+
+years = [2011, 2014, 2018]
+#读取夜光数据
+pollutants_data = pd.read_csv("night_light_result_prov.csv", encoding="utf-8")
+#读取CLHLS数据
+CLHLS_data = pd.read_csv("CLHLS/clhls_1998_2018_pollutants.csv")
+print(CLHLS_data.info())
+for year in years:
+    #两个表合并
+    table_merge = pd.merge(CLHLS_data, pollutants_data, left_on="province", right_on="prov_name", how='left')
+    #更新CLHLS表
+    CLHLS_data['last_year_nl'+"_"+str(year)[-2:]] = table_merge[str(year-1)].values
+    CLHLS_data['before_last_nl'+"_"+str(year)[-2:]] = table_merge[str(year-2)].values
+    print(year)
+CLHLS_data.to_csv("CLHLS/clhls_1998_2018_p_n.csv",index=False)

+ 64 - 0
CLHLS_P/CLHLS_PM.py

@@ -0,0 +1,64 @@
+import pandas as pd
+from glob import glob
+import os
+
+def pollutant_handle(path):
+    years = [2011, 2014, 2018]
+    #读取污染物数据
+    pollutants_data = pd.read_csv("pollution/result_SO2_p_prov.csv")
+    for year in years:
+        CLHLS_data = pd.read_csv(path)
+        print(CLHLS_data.info())
+        #开始筛选出year的数据
+        # CLHLS_data_year = CLHLS_data[CLHLS_data['wave']==year]
+        #两个表合并
+        table_merge = pd.merge(CLHLS_data, pollutants_data, on=['province', 'province'], how='left')
+        if str(year - 1) in table_merge.columns:
+            #更新CLHLS表
+            CLHLS_data['last_year_SO2'+"_"+str(year)[-2:]] = table_merge[str(year-1)].values
+        if str(year - 2) in table_merge.columns:
+            CLHLS_data['before_last_SO2'+"_"+str(year)[-2:]] = table_merge[str(year-2)].values
+        CLHLS_data.to_csv(path,index=False)
+        print(year)
+
+def aba_handle(path_data):
+    years = [2011, 2014, 2018]
+    for year in years:
+        CLHLS_data = pd.read_csv(path_data)
+        path = "aba627/result/"
+        #读取污染物组分
+        last_year_file_name = path+str(year-1)+"_PM25_and_species_p_prov.csv"
+        before_last_file_name = path+str(year-2)+"_PM25_and_species_p_prov.csv"
+        last_year_pollutants_data = pd.read_csv(last_year_file_name)
+        before_last_pollutants_data = pd.read_csv(before_last_file_name)
+        #开始筛选出year的数据
+        # CLHLS_data_year = CLHLS_data[CLHLS_data['wave']==year]
+        #和上一年的污染物组分文件合并
+        last_table_merge = pd.merge(CLHLS_data, last_year_pollutants_data, on=['province', 'province'], how='left')
+        CLHLS_data['last_year_SO4'+"_"+str(year)[-2:]] = last_table_merge["SO4"].values
+        CLHLS_data['last_year_NO3'+"_"+str(year)[-2:]] = last_table_merge["NO3"].values
+        CLHLS_data['last_year_NH4'+"_"+str(year)[-2:]] = last_table_merge["NH4"].values
+        CLHLS_data['last_year_OM'+"_"+str(year)[-2:]] = last_table_merge["OM"].values
+        CLHLS_data['last_year_BC'+"_"+str(year)[-2:]] = last_table_merge["BC"].values
+        #和上上年的污染物组分文件合并
+        before_last_table_merge = pd.merge(CLHLS_data, before_last_pollutants_data, on=['province', 'province'], how='left')
+        CLHLS_data['before_last_SO4'+"_"+str(year)[-2:]] = before_last_table_merge["SO4"].values
+        CLHLS_data['before_last_NO3'+"_"+str(year)[-2:]] = before_last_table_merge["NO3"].values
+        CLHLS_data['before_last_NH4'+"_"+str(year)[-2:]] = before_last_table_merge["NH4"].values
+        CLHLS_data['before_last_OM'+"_"+str(year)[-2:]] = before_last_table_merge["OM"].values
+        CLHLS_data['before_last_BC'+"_"+str(year)[-2:]] = before_last_table_merge["BC"].values
+        #更新CLHLS表
+        CLHLS_data.to_csv(path_data,index=False)
+        print(year)
+
+if __name__ == "__main__":
+    #读取CLHLS数据
+    path = "CLHLS/clhls_1998_2018_pollutants.csv"
+    # CLHLS_data = pd.read_csv("CLHLS/clhls_1998_2018_result.csv")
+    # print(CLHLS_data.info())
+    # CLHLS_data.to_csv("CLHLS/clhls_1998_2018_pollutants.csv",index=False)
+    
+    #处理污染物
+    # pollutant_handle(path)
+    #处理PM2.5组分
+    aba_handle(path)

+ 125 - 0
CLHLS_P/CLHLS_meteorology.py

@@ -0,0 +1,125 @@
+import pandas as pd
+
+def sunlight(CLHLS_data):
+    years = [2011, 2014, 2018]
+    for year in years:
+        #读取日照数据
+        sunlight_data_last = pd.read_excel(f"气象数据/2000-2020年按年逐年日照/【{year-1}年】逐年日照.xlsx")
+        sunlight_data_last = sunlight_data_last[["省份", "累积日照"]]
+        sunlight_data_last = sunlight_data_last.groupby('省份', as_index=False).mean()
+        sunlight_data_last = sunlight_data_last.rename(columns={"累积日照":"last_sunlight"})
+        sunlight_data_before_last = pd.read_excel(f"气象数据/2000-2020年按年逐年日照/【{year-2}年】逐年日照.xlsx")
+        sunlight_data_before_last = sunlight_data_before_last[["省份", "累积日照"]]
+        sunlight_data_before_last = sunlight_data_before_last.groupby('省份', as_index=False).mean()
+        sunlight_data_before_last = sunlight_data_before_last.rename(columns={"累积日照":"before_sunlight"})
+        #合并日照
+        table_merge = pd.merge(CLHLS_data, sunlight_data_last, left_on="province", right_on="省份", how='left')
+        table_merge = pd.merge(table_merge, sunlight_data_before_last, left_on="province", right_on="省份", how='left')
+        #更新CLHLS表
+        CLHLS_data['last_year_sunlight'+"_"+str(year)[-2:]] = table_merge['last_sunlight'].values
+        CLHLS_data['before_last_sunlight'+"_"+str(year)[-2:]] = table_merge['before_sunlight'].values
+        CLHLS_data.to_csv("CLHLS/clhls_1998_2018_p_n_m.csv",index=False)
+        print(year)
+
+def wind(CLHLS_data):
+    years = [2011, 2014, 2018]
+    for year in years:
+        #读取风速数据
+        wind_data_last = pd.read_excel(f"气象数据/2000-2020年按年逐年风速/【{year-1}年】逐年风速.xlsx")
+        wind_data_last = wind_data_last[["省份", "平均风速"]]
+        wind_data_last = wind_data_last.groupby('省份', as_index=False).mean()
+        wind_data_last = wind_data_last.rename(columns={"平均风速":"last_wind"})
+        wind_data_before_last = pd.read_excel(f"气象数据/2000-2020年按年逐年风速/【{year-2}年】逐年风速.xlsx")
+        wind_data_before_last = wind_data_before_last[["省份", "平均风速"]]
+        wind_data_before_last = wind_data_before_last.groupby('省份', as_index=False).mean()
+        wind_data_before_last = wind_data_before_last.rename(columns={"平均风速":"before_wind"})
+        #合并风速
+        table_merge = pd.merge(CLHLS_data, wind_data_last, left_on="province", right_on="省份", how='left')
+        table_merge = pd.merge(table_merge, wind_data_before_last, left_on="province", right_on="省份", how='left')
+        #更新CLHLS表
+        CLHLS_data['last_year_wind'+"_"+str(year)[-2:]] = table_merge['last_wind'].values
+        CLHLS_data['before_last_wind'+"_"+str(year)[-2:]] = table_merge['before_wind'].values
+        CLHLS_data.to_csv("CLHLS/clhls_1998_2018_p_n_m.csv",index=False)
+        print(year)
+
+def rain(CLHLS_data):
+    years = [2011, 2014, 2018]
+    for year in years:
+        #读取降水数据
+        rain_data_last = pd.read_excel(f"气象数据/2000-2020年按年逐年降水/【{year-1}年】逐年降水.xlsx")
+        rain_data_last = rain_data_last[["省份", "累积降水"]]
+        rain_data_last = rain_data_last.groupby('省份', as_index=False).mean()
+        rain_data_last = rain_data_last.rename(columns={"累积降水":"last_rain"})
+        rain_data_before_last = pd.read_excel(f"气象数据/2000-2020年按年逐年降水/【{year-2}年】逐年降水.xlsx")
+        rain_data_before_last = rain_data_before_last[["省份", "累积降水"]]
+        rain_data_before_last = rain_data_before_last.groupby('省份', as_index=False).mean()
+        rain_data_before_last = rain_data_before_last.rename(columns={"累积降水":"before_rain"})
+        #合并降水
+        table_merge = pd.merge(CLHLS_data, rain_data_last, left_on="province", right_on="省份", how='left')
+        table_merge = pd.merge(table_merge, rain_data_before_last, left_on="province", right_on="省份", how='left')
+        #更新CLHLS表
+        CLHLS_data['last_year_rain'+"_"+str(year)[-2:]] = table_merge['last_rain'].values
+        CLHLS_data['before_last_rain'+"_"+str(year)[-2:]] = table_merge['before_rain'].values
+        CLHLS_data.to_csv("CLHLS/clhls_1998_2018_p_n_m.csv",index=False)
+        print(year)
+
+def temperature(CLHLS_data):
+    years = [2011, 2014, 2018]
+    for year in years:
+        #读取气温数据
+        temperature_data_last = pd.read_excel(f"气象数据/2000-2020年按年逐年气温/【{year-1}年】逐年气温.xlsx")
+        temperature_data_last = temperature_data_last[["省份", "平均气温"]]
+        temperature_data_last = temperature_data_last.groupby('省份', as_index=False).mean()
+        temperature_data_last = temperature_data_last.rename(columns={"平均气温":"last_temperature"})
+        temperature_data_before_last = pd.read_excel(f"气象数据/2000-2020年按年逐年气温/【{year-2}年】逐年气温.xlsx")
+        temperature_data_before_last = temperature_data_before_last[["省份", "平均气温"]]
+        temperature_data_before_last = temperature_data_before_last.groupby('省份', as_index=False).mean()
+        temperature_data_before_last = temperature_data_before_last.rename(columns={"平均气温":"before_temperature"})
+        #合并气温
+        table_merge = pd.merge(CLHLS_data, temperature_data_last, left_on="province", right_on="省份", how='left')
+        table_merge = pd.merge(table_merge, temperature_data_before_last, left_on="province", right_on="省份", how='left')
+        #更新CLHLS表
+        CLHLS_data['last_year_temperature'+"_"+str(year)[-2:]] = table_merge['last_temperature'].values
+        CLHLS_data['before_last_temperature'+"_"+str(year)[-2:]] = table_merge['before_temperature'].values
+        CLHLS_data.to_csv("CLHLS/clhls_1998_2018_p_n_m.csv",index=False)
+        print(year)
+
+def humidity(CLHLS_data):
+    years = [2011, 2014, 2018]
+    for year in years:
+        #读取湿度数据
+        humidity_data_last = pd.read_excel(f"气象数据/2000-2020年按年逐年湿度/【{year-1}年】逐年湿度.xlsx")
+        humidity_data_last = humidity_data_last[["省份", "平均湿度"]]
+        humidity_data_last = humidity_data_last.groupby('省份', as_index=False).mean()
+        humidity_data_last = humidity_data_last.rename(columns={"平均湿度":"last_humidity"})
+        humidity_data_before_last = pd.read_excel(f"气象数据/2000-2020年按年逐年湿度/【{year-2}年】逐年湿度.xlsx")
+        humidity_data_before_last = humidity_data_before_last[["省份", "平均湿度"]]
+        humidity_data_before_last = humidity_data_before_last.groupby('省份', as_index=False).mean()
+        humidity_data_before_last = humidity_data_before_last.rename(columns={"平均湿度":"before_humidity"})
+        #合并湿度
+        table_merge = pd.merge(CLHLS_data, humidity_data_last, left_on="province", right_on="省份", how='left')
+        table_merge = pd.merge(table_merge, humidity_data_before_last, left_on="province", right_on="省份", how='left')
+        #更新CLHLS表
+        CLHLS_data['last_year_humidity'+"_"+str(year)[-2:]] = table_merge['last_humidity'].values
+        CLHLS_data['before_last_humidity'+"_"+str(year)[-2:]] = table_merge['before_humidity'].values
+        CLHLS_data.to_csv("CLHLS/clhls_1998_2018_p_n_m.csv",index=False)
+        print(year)
+
+if __name__ == "__main__":
+    #读取CLHLS数据
+    CLHLS_data = pd.read_csv("CLHLS/clhls_1998_2018_p_n.csv")
+    CLHLS_data.to_csv("CLHLS/clhls_1998_2018_p_n_m.csv",index=False)
+    CLHLS_data = pd.read_csv("CLHLS/clhls_1998_2018_p_n_m.csv")
+    sunlight(CLHLS_data)
+    CLHLS_data = pd.read_csv("CLHLS/clhls_1998_2018_p_n_m.csv")
+    wind(CLHLS_data)
+    CLHLS_data = pd.read_csv("CLHLS/clhls_1998_2018_p_n_m.csv")
+    rain(CLHLS_data)
+    CLHLS_data = pd.read_csv("CLHLS/clhls_1998_2018_p_n_m.csv")
+    temperature(CLHLS_data)
+    CLHLS_data = pd.read_csv("CLHLS/clhls_1998_2018_p_n_m.csv")
+    humidity(CLHLS_data)
+
+
+
+

+ 7 - 2
CLHLS_P/CLHLS_process.py

@@ -61,6 +61,11 @@ def deal_1998_2018_data():
     data = pd.read_csv(csv_file_path)
     # 存活状态0存活;1死亡;-9失访;-8死亡/失访
     result = data[['id', 'dth98_00','dth00_02', 'dth02_05', 'dth02_05', 'dth05_08', 'dth08_11', 'dth11_14', 'dth14_18']]
+    # 省份
+    csv_prov = "CLHLS/provice_code.csv"
+    provice_data = pd.read_csv(csv_prov)
+    data = data.merge(provice_data, left_on='prov', right_on='prov_code', how='left')
+    result["province"] = data["prov_name"]
     # 人口特征学变量
     # 8/9代表无法回答和缺失
     # 年龄
@@ -431,8 +436,8 @@ def deal_2008_2018_data():
     result.to_csv("CLHLS/clhls_2008_2018_result.csv", index=False)
 
 if __name__ == "__main__":
-    # deal_1998_2018_data()
-    deal_2008_2018_data()
+    deal_1998_2018_data()
+    # deal_2008_2018_data()
     print(123)
 
 

+ 21 - 0
CLHLS_P/nl_process.py

@@ -0,0 +1,21 @@
+import pandas as pd
+from glob import glob
+
+csv_prov = "CLHLS/provice_code.csv"
+provice_data = pd.read_csv(csv_prov)
+
+#读取夜光数据
+night_light_result = pd.read_csv("night_light_result.csv")
+
+night_light_result["prov_code"] = night_light_result['id'].astype(str).apply(lambda x: x[:2])
+
+provice_data['prov_code'] = provice_data['prov_code'].astype(str)
+
+night_light_result = night_light_result.merge(provice_data, left_on='prov_code', right_on='prov_code', how='left')
+
+# 排除 'ext_name'、'id'、'prov_code' 列,按 'prov_name' 对其他列进行 groupby 和 mean 计算
+grouped = night_light_result.drop(columns=['ext_name', "id", "prov_code"]).groupby('prov_name', as_index=False).mean()
+
+grouped.to_csv("night_light_result_prov.csv", encoding="utf-8", index=False)    
+
+    

+ 29 - 0
CLHLS_P/pollution_process.py

@@ -0,0 +1,29 @@
+import pandas as pd
+from glob import glob
+import os
+
+def pollutant_provice_handle():
+    path = "pollution/result_pm10_1km_p"
+    data = pd.read_csv(path+".csv")
+    # 排除 'province' 和 'city' 列,对其他列进行 groupby 和 mean 计算
+    grouped = data.drop(columns=['city']).groupby('province').mean()
+    # 输出结果,结果中包含 'province' 列
+    grouped.reset_index(inplace=True)
+    grouped.to_csv(path+"_prov.csv", index=False)
+
+def aba_provice_handle():
+    path = "aba627/result/"
+    files = glob(path+"*_p.csv")
+    for file in files:
+        data = pd.read_csv(file)
+        # 排除 'province' 和 'city' 列,对其他列进行 groupby 和 mean 计算
+        grouped = data.drop(columns=['city']).groupby('province').mean()
+        # 输出结果,结果中包含 'province' 列
+        grouped.reset_index(inplace=True)
+        tmp = os.path.basename(file)
+        file_name, extension = os.path.splitext(tmp)
+        grouped.to_csv(path+file_name+"_prov"+extension, index=False)
+
+if __name__ == "__main__":
+    # pollutant_provice_handle()
+    aba_provice_handle()

+ 0 - 3
time.md

@@ -26,9 +26,6 @@ docker run -itd -p 8063:22 -v /home/ubuntu/PaddleFaq/r_base:/root/r_base --secur
 docker run -itd -p 17860:7860 -p 18022:22 -p 17861:7861 -v /home/zhengzhou/seatassistant/java_docker:/root  --cap-add=NET_ADMIN --device /dev/net/tun --name=java_base java_base:latest
 
 
-192.168.210.83
-
-255.255.255.0
 # 备忘
 ---
 飞天: