1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
|
import csv import re
def LabelData(elem, labeled_col_list, label):
    """Return the numeric label for ``elem``, assigning a new one if needed.

    ``labeled_col_list`` records every value seen so far, in first-seen
    order, and is mutated in place when ``elem`` is new.

    :param elem: the value to label
    :param labeled_col_list: list of values that already have a label
    :param label: the label to hand out to the next unseen value
    :return: tuple ``(label for elem, next label to hand out)``
    """
    # Already known: its label is its 1-based position in the list, and the
    # next free label is unchanged.
    if elem in labeled_col_list:
        return labeled_col_list.index(elem) + 1, label

    # First occurrence: remember it, hand out ``label`` and advance.
    labeled_col_list.append(elem)
    return label, label + 1
# Marker the input data uses for a missing value.
NO_DATA = '暂无数据'

# Divisor used to turn the summed lease terms into the fill value for rows
# whose lease term is missing.
# NOTE(review): hard-coded; presumably the number of rows in the data set
# (or of rows with a known term) -- confirm, or derive it from the data.
TERM_AVERAGE_DIVISOR = 1503


def _lease_term_in_months(text):
    """Convert a lease-term string into a whole number of months.

    The first run of digits in ``text`` is the quantity. It is multiplied by
    12 when ``text`` contains '年' (year) and used unchanged when it contains
    '月' (month); '年' takes precedence if both appear.

    :param text: raw lease-term cell
    :return: lease term in months as an int
    :raises ValueError: if ``text`` contains neither unit or no digits
    """
    if '年' in text:
        months_per_unit = 12
    elif '月' in text:
        months_per_unit = 1
    else:
        # Without this branch a stale value from the previous row would be
        # reused silently.
        raise ValueError('unrecognised lease term: {!r}'.format(text))

    quantity = re.search(r'\d+', text)
    if quantity is None:
        raise ValueError('lease term has no number: {!r}'.format(text))
    return int(quantity.group()) * months_per_unit


# Values seen so far in columns 1-3 and the next free label for each column.
labeled_col_1, labeled_col_2, labeled_col_3 = [], [], []
label1, label2, label3 = 1, 1, 1

# Sum of all known lease terms, in months.
total_term = 0

# Encoded rows, buffered because the fill value for missing lease terms is
# only known once every row has been read.
rows = []

# Category -> integer code tables, built once instead of once per row.
rent_type_dic = {'整租': 1, '合租': 2}
aspect_dic = {
    '东': 1,
    '南': 2,
    '西': 3,
    '北': 4,
    '东南': 5,
    '东北': 6,
    '西南': 7,
    '西北': 8,
}
charge_mode_dic = {
    '月付价': 1,
    '季付价': 2,
    '半年付价': 3,
    '年付价': 4,
    'None': 5,
}
see_house_dic = {'随时可看': 1, '需提前预约': 2, '一般下班后可看': 3}
floor_dic = {'低楼层': 1, '中楼层': 2, '高楼层': 3}
lift_dic = {'有': 1, '无': 2, '暂无数据': 3}
stall_dic = {'暂无数据': 1, '免费使用': 2, '租用车位': 3}
water_dic = {'民水': 1, '商水': 2, '暂无数据': 3}
elec_dic = {'民电': 1, '商电': 2, '暂无数据': 3}
gas_dic = {'有': 1, '无': 2, '暂无数据': 3}
heating_dic = {'集中供暖': 1, '自采暖': 2, '暂无数据': 3}

# Columns whose value must be one of the keys of the paired table; an
# unknown value raises KeyError.
category_columns = (
    (6, aspect_dic),
    (8, charge_mode_dic),
    (14, see_house_dic),
    (15, floor_dic),
    (17, lift_dic),
    (18, stall_dic),
    (19, water_dic),
    (20, elec_dic),
    (21, gas_dic),
    (22, heating_dic),
)

# NOTE(review): no encoding is passed, so both files use the platform
# default encoding -- confirm it matches how the input file was written.
with open('ProcessedTianjinRentHouseInfo.csv', 'r', newline='') as csv_in_file, \
        open('LabeledTianjinRentHouseInfo.csv', 'w', newline='') as csv_out_file:
    filereader = csv.reader(csv_in_file)
    filewriter = csv.writer(csv_out_file)

    # The header row is copied through unchanged.
    head = next(filereader)
    filewriter.writerow(head)

    for row_list in filereader:
        # Column 0 is overwritten with the constant 1.
        row_list[0] = 1

        # Columns 1-3: label each distinct value in first-seen order.
        row_list[1], label1 = LabelData(row_list[1], labeled_col_1, label1)
        row_list[2], label2 = LabelData(row_list[2], labeled_col_2, label2)
        row_list[3], label3 = LabelData(row_list[3], labeled_col_3, label3)

        # Column 5: known rent types are coded, anything else is kept as-is.
        row_list[5] = rent_type_dic.get(row_list[5], row_list[5])

        # Column 12: only '随时入住' is coded; other values are kept as-is.
        if row_list[12] == '随时入住':
            row_list[12] = 1

        # Column 13: lease term in months; missing values are filled later.
        if row_list[13] != NO_DATA:
            term = _lease_term_in_months(row_list[13])
            row_list[13] = term
            total_term += term

        for column, code_table in category_columns:
            row_list[column] = code_table[row_list[column]]

        rows.append(row_list)

    # Second pass: fill missing lease terms with the integer average, then
    # write every row out.
    fill_term = total_term // TERM_AVERAGE_DIVISOR
    for row_list in rows:
        if row_list[13] == NO_DATA:
            row_list[13] = fill_term
        filewriter.writerow(row_list)

# Reported only after both files have been closed.
print('写入成功')
|