Python——階段總結(一)


import xlrd # 讀xlsx
import xlsxwriter # 寫xlsx
import urllib.request # url請求,Python3自帶,Python2與3中urllib的區別見:http://blog.csdn.net/Jurbo/article/details/52313636
import os # 創建output文件夾
import glob # 獲取文件夾下文件名稱
import time # 記錄時間
import json # 讀取json格式文件

def xlsx_merge(folder, header, filename):
    """Merge the first sheet of every ``.xlsx`` file in a folder into one workbook.

    Every file matching ``folder + "*.xlsx"`` is opened with ``xlrd``. Its first
    row is treated as a header and dropped; the remaining rows are appended,
    file after file, to a worksheet named ``"merged"`` in
    ``folder + filename + ".xlsx"``. Row 0 of that worksheet holds ``header``.

    Args:
        folder: Directory prefix. It is concatenated directly with file names,
            so it must already end with a path separator.
        header: Sequence of column titles written to the first output row.
        filename: Base name (without extension) of the merged workbook.
    """
    output_path = folder + filename + ".xlsx"
    output_key = os.path.normcase(os.path.abspath(output_path))

    # Sorted so the merge order is reproducible. A merged workbook left behind
    # by an earlier run also matches the pattern and must not be merged into
    # itself.
    source_paths = [
        path
        for path in sorted(glob.glob(folder + "*.xlsx"))
        if os.path.normcase(os.path.abspath(path)) != output_key
    ]

    merged_rows = []
    merged_files = 0
    for path in source_paths:
        work_book = xlrd.open_workbook(path)
        try:
            source_sheet = work_book.sheet_by_index(0)
        except Exception as error:
            # Report and skip this file. Falling through would either hit an
            # undefined name or re-read the previous file's sheet.
            print(error)
            continue
        # Row 0 is the per-file header; only the data rows are collected.
        merged_rows.extend(
            source_sheet.row_values(row_index)
            for row_index in range(1, source_sheet.nrows)
        )
        merged_files += 1

    # The context manager closes (and thereby writes) the workbook even if a
    # cell write raises.
    with xlsxwriter.Workbook(output_path) as merged_book:
        merged_sheet = merged_book.add_worksheet("merged")
        for col_index, title in enumerate(header):
            merged_sheet.write(0, col_index, title)
        for row_index, values in enumerate(merged_rows, start=1):
            for col_index, value in enumerate(values):
                merged_sheet.write(row_index, col_index, value)

    print("已完成%d個文件的合並" % merged_files)

def _timestamp():
    """Return the current local time as ``YYYY-MM-DD HH:MM:SS`` for log lines."""
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))


def _fetch_poi_page(url):
    """Download one result page and return the value stored under ``"pois"``.

    Raises whatever ``urlopen`` / ``json.load`` raise, or ``KeyError`` when the
    decoded response has no ``"pois"`` entry.
    """
    # The context manager closes the HTTP response once it has been parsed.
    with urllib.request.urlopen(url) as response:
        return json.load(response)["pois"]


def _write_poi_row(sheet, row_index, record, header):
    """Write one POI record: the ``header`` fields, then rating and cost."""
    for col_index, field in enumerate(header):
        sheet.write(row_index, col_index, str(record.get(field, "")))
    biz_ext = record.get("biz_ext")
    # Tolerate a missing or non-mapping "biz_ext" instead of aborting the page.
    if not isinstance(biz_ext, dict):
        biz_ext = {}
    sheet.write(row_index, len(header), str(biz_ext.get("rating", "")))
    sheet.write(row_index, len(header) + 1, str(biz_ext.get("cost", "")))


def _save_pois(workbook_file, poi_type, city_code, key, header, header_full, offset):
    """Fetch all result pages for one district / POI type into ``workbook_file``.

    Pages 1..100 are requested in order; paging stops at the first page that
    holds fewer than ``offset`` records.

    Returns:
        True when paging finished normally. False when a request or a record
        failed; the partial workbook is then deleted so that the
        "file already exists" resume check retries it on the next run.
    """
    from urllib.parse import urlencode

    workbook = xlsxwriter.Workbook(workbook_file)
    sheet = workbook.add_worksheet("result")
    for col_index, title in enumerate(header_full):
        sheet.write(0, col_index, title)

    row_index = 1
    complete = False
    try:
        for page_index in range(1, 101):
            # urlencode percent-escapes the values, so non-ASCII type names or
            # keys no longer make urlopen fail.
            query = urlencode([
                ("keywords", ""),
                ("types", poi_type),
                ("city", city_code),
                ("citylimit", "true"),
                ("offset", offset),
                ("page", page_index),
                ("key", key),
                ("extensions", "all"),
            ])
            # Request format: http://lbs.amap.com/api/webservice/guide/api/search/
            pois = _fetch_poi_page("http://restapi.amap.com/v3/place/text?" + query)
            for record in pois[:offset]:
                _write_poi_row(sheet, row_index, record, header)
                row_index += 1
            if len(pois) < offset:
                # A short (or empty) page is the last one.
                break
        complete = True
    except Exception as error:
        # Best effort: report the failure and move on to the next file rather
        # than aborting the whole crawl.
        print(_timestamp() + " " + workbook_file + " 獲取失敗:" + str(error))
    finally:
        workbook.close()
        if not complete:
            # Also reached on Ctrl-C: never leave a truncated file that would
            # later be mistaken for a finished one.
            os.remove(workbook_file)
    return complete


def poi_by_adcode_poicode(folder, city_file="city", poi_file="poi", merge_or_not=1, key="你的key"):
    """Download POI listings per district and POI type into ``.xlsx`` files.

    Three workbooks are read from ``folder + "input/"`` (first sheet, first row
    skipped as a header):

    * ``folder.xlsx``: column 0 = output folder name, column 1 = its code.
    * ``<city_file>.xlsx``: column 0 = district name, column 1 = district code.
    * ``<poi_file>.xlsx``: column 1 = POI type sent as the ``types`` parameter.

    For each folder row, the district rows whose code shares its first four
    characters with the folder code are processed. The district list is walked
    once, front to back, so its rows are assumed to be grouped in the same
    order as the folder rows. Codes are sliced as strings, so the code cells
    are assumed to hold text (TODO confirm against the input files).

    One workbook per district / POI type is written to
    ``folder + <folder name> + "/"``. Files that already exist are skipped,
    which lets an interrupted run resume.

    Args:
        folder: Working directory prefix; must end with a path separator.
        city_file: Base name of the district workbook under ``input/``.
        poi_file: Base name of the POI-type workbook under ``input/``.
        merge_or_not: When equal to 1, each folder's files are merged with
            ``xlsx_merge`` unless the merged workbook already exists.
        key: Web-service API key sent with every request. The default is a
            placeholder and must be replaced with a real key.
    """
    # Fields written for every record; "rating" and "cost" come from the
    # nested "biz_ext" entry and are appended as the last two columns.
    header = ["id", "name", "type", "typecode", "biz_type", "address", "location", "tel", "pname", "cityname", "adname"]
    header_full = header + ["rating", "cost"]
    offset = 25  # records requested per page

    input_folder = folder + "input/"
    folder_sheet = xlrd.open_workbook(input_folder + "folder.xlsx").sheet_by_index(0)
    folder_list = folder_sheet.col_values(0)
    folder_code_list = folder_sheet.col_values(1)
    city_sheet = xlrd.open_workbook(input_folder + city_file + ".xlsx").sheet_by_index(0)
    city_list = city_sheet.col_values(0)
    city_code_list = city_sheet.col_values(1)
    poi_type_sheet = xlrd.open_workbook(input_folder + poi_file + ".xlsx").sheet_by_index(0)
    poi_type_list = poi_type_sheet.col_values(1)

    # Total number of district / POI-type combinations (header rows excluded).
    total_work = (city_sheet.nrows - 1) * (poi_type_sheet.nrows - 1)
    # First district row not yet consumed by an earlier folder.
    city_col_index = 1

    print(_timestamp() + ":抓取開始!")
    for folder_index in range(1, len(folder_list)):
        folder_name = folder_list[folder_index]
        output_folder = folder + folder_name + "/"
        os.makedirs(output_folder, exist_ok=True)

        for city_index in range(city_col_index, len(city_list)):
            if folder_code_list[folder_index][0:4] != city_code_list[city_index][0:4]:
                # First district that belongs to another folder: stop here.
                break
            for poi_type_index in range(1, len(poi_type_list)):
                city_name = str(city_list[city_index])
                poi_type = str(poi_type_list[poi_type_index])
                workbook_file = output_folder + city_name + poi_type + ".xlsx"
                if os.path.exists(workbook_file):
                    print(city_name + poi_type + " 已存在")
                    continue
                if not _save_pois(workbook_file, poi_type, city_code_list[city_index],
                                  key, header, header_full, offset):
                    continue
                # Both factors exclude the header row so that the last
                # combination reports exactly 100%.
                work_index = (city_index - 1) * (len(poi_type_list) - 1) + poi_type_index
                print(_timestamp() + city_name + " " + poi_type
                      + " 已獲取!進度:%.3f%%" % (work_index / total_work * 100))
            city_col_index += 1
        print(_timestamp() + folder_name + "已完成!")

        if merge_or_not == 1:
            if not os.path.exists(output_folder + folder_name + ".xlsx"):
                xlsx_merge(output_folder, header_full, folder_name)
                print(_timestamp() + ":已對文件進行合並!")
        else:
            print(_timestamp() + ":未進行合並!")
    print(_timestamp() + ":所有工作完成!")

# Script entry point: run the crawl on the local working directory with the
# default input workbook names and merging enabled.
poi_by_adcode_poicode(
    "E:/XXDir/",
    city_file="city",
    poi_file="poi",
    merge_or_not=1,
)

1、如果是將數據儲存在excel中(存儲在數據庫中時,不用考慮),最好將結果分階段保存成單獨文件並及時輸出時間和進度

2、可以通過判斷以前保存的文件是否存在達到斷點續爬的目的,也可以通過此方式,實現多主機共享進度(農村人的分布式爬取^-^,通過建立共享文件夾)。

3、做多重循環時,要考慮清楚每個步驟應該置於哪一層循環之下。

4、通過將複雜的功能拆分成多個小的功能,可以更好地完成一段複雜代碼的編寫。

5、盡可能將實現的功能編寫成函數和庫,以便下次調用。


注意!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系我们删除。



 
  © 2014-2022 ITdaan.com 联系我们: