# test.py (~2.6 KB) — scratch script of one-off dataset size / ID checks
# (CLHLS, HRS, ELSA, NHANES, CHARLS). Only the import is live; every
# experiment below is kept commented out as a record of what was tried.
import pandas as pd

# data = pd.read_csv("CLHLS/clhls_1998_2018_result.csv")
# print(data.shape)

# data = pd.read_csv("HRS/result_all.csv")
# print(data.shape)
# # Deduplicate and count the number of unique IDs.
# # NOTE(review): .count() returns per-column non-null counts, not a single
# # ID count — len(unique_ids) would give the number of unique IDs directly.
# unique_ids = data.drop_duplicates(subset=["HHID", "PN"])
# count_unique_ids = unique_ids.count()
# print(count_unique_ids)

# data = pd.read_csv("/root/r_base/UKDA-5050-stata/result_all.csv")
# print(data.shape)
# # Deduplicate and count the number of unique IDs.
# unique_ids = data.drop_duplicates(subset=["id"])
# count_unique_ids = unique_ids.count()
# print(count_unique_ids)

# df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_elsa_data_eul_v1.dta', convert_categoricals=False)
# print(df.shape)
# df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_elsa_pensiongrid_eul_v2.dta', convert_categoricals=False)
# print(df.shape)
# df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_financial_derived_variables.dta', convert_categoricals=False)
# print(df.shape)
# df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_ifs_derived_variables.dta', convert_categoricals=False)
# print(df.shape)

# # Specify the folder path.
# import glob
# import os
# folder_path = '/root/r_base/NHANES/2017-2018'
# # Collect the paths of all .xpt files.
# xpt_files = glob.glob(os.path.join(folder_path, '*.XPT'))
# num = 0
# # Read and process each .xpt file, accumulating the total column count.
# for file_path in xpt_files:
#     try:
#         # Read the .xpt file with pandas.
#         df = pd.read_sas(file_path, format='xport')
#         # Print the shape for inspection.
#         print(f"Data from {file_path}:")
#         print(df.shape)
#         num += df.shape[1]
#     except Exception as e:
#         print(f"Error reading {file_path}: {e}")
# print(num)

# data = pd.read_csv("/root/r_base/CHARLS/result_all.csv")
# print(data.shape)
# # Deduplicate and count the number of unique IDs.
# unique_ids = data.drop_duplicates(subset=["householdID"])
# count_unique_ids = unique_ids.count()
# print(count_unique_ids)

# # Specify the folder path.
# import glob
# import os
# folder_path = '/root/r_base/CHARLS/CHARLS2018'
# # Collect the paths of all .dta files.
# xpt_files = glob.glob(os.path.join(folder_path, '*.dta'))
# num = 0
# # Read and process each .dta file, accumulating the total column count.
# for file_path in xpt_files:
#     try:
#         # Read the .dta file with pandas.
#         df = pd.read_stata(file_path)
#         # Print the shape for inspection.
#         print(f"Data from {file_path}:")
#         print(df.shape)
#         num += df.shape[1]
#     except Exception as e:
#         print(f"Error reading {file_path}: {e}")
# print(num)
  67. # print(num)