### Python基礎練習（五）—— 數據清洗補全處理

Python基礎練習（五）—— 數據清洗補全處理

# -*- coding: utf-8 -*-
"""Python basics exercise 5: data cleaning and missing-value completion.

Tasks performed when the file is run as a script:

1. Read the liver-qi stagnation syndrome coefficient data set
   (gqyjxsj.xls), split it into 5 groups by equal-width binning and by
   equal-frequency binning, and print the median and standard deviation
   of every group.
2. Read BHP1.csv, fill its missing values by Lagrange interpolation and
   write the result to BHP1_RES.csv.
3. Read BHP2.xlsx and outer-merge it with BHP1 into the BHP data set.
4. Replace the trading volume in BHP with the levels high / median / low.
"""
import numpy as np
import pandas as pd
from pandas import DataFrame
from scipy.interpolate import lagrange  # Lagrange interpolation

path = 'D:/pytest/week6data/'  # data directory

BHP1_COLUMNS = ['name', 'date', 'Open', 'High', 'Low', 'Close']
BHP2_COLUMNS = ['name', 'date', 'Open', 'High', 'Low', 'Close', 'volume']


def add_bin_columns(frame, bins=5, precision=2):
    """Add equal-width and equal-frequency bin columns for ``frame['Values']``.

    Args:
        frame: DataFrame with a numeric ``Values`` column; modified in place.
        bins: number of groups to create with each method.
        precision: decimal precision used for the bin labels.

    Returns:
        The same ``frame`` with two new columns: ``Group_XZDJ`` (equal-width
        bins from ``pd.cut``) and ``Group_XZDL`` (equal-frequency bins from
        ``pd.qcut``).
    """
    frame['Group_XZDJ'] = pd.cut(frame['Values'], bins, precision=precision)
    frame['Group_XZDL'] = pd.qcut(frame['Values'], bins, precision=precision)
    return frame


def bin_statistics(frame, group_column):
    """Return the median and standard deviation of ``Values`` per bin.

    Args:
        frame: DataFrame holding ``Values`` and the bin column.
        group_column: name of the bin column to group by.

    Returns:
        DataFrame indexed by bin with the columns ``median`` and ``std``.
    """
    # observed=False keeps empty equal-width bins in the result, so all bins
    # are always reported.
    grouped = frame['Values'].groupby(frame[group_column], observed=False)
    return DataFrame({'median': grouped.median(), 'std': grouped.std()})


def ployinterp_column(s, n, k=5):
    """Interpolate the value at label ``n`` of ``s`` with a Lagrange polynomial.

    Args:
        s: Series indexed by consecutive integer labels.
        n: label of the element to interpolate.
        k: number of neighbours to take on each side of ``n``.

    Returns:
        The interpolated value, or NaN when no neighbour in the window holds
        a value.
    """
    labels = list(range(n - k, n)) + list(range(n + 1, n + 1 + k))
    # reindex() yields NaN for labels that fall outside the series (windows
    # near either end) instead of raising KeyError like s[labels] does.
    y = s.reindex(labels)
    y = y[y.notnull()]  # drop missing neighbours
    if y.empty:
        # lagrange() of zero points is the constant 0, which would silently
        # invent a value; keep the element missing instead.
        return np.nan
    return lagrange(y.index.tolist(), list(y))(n)


def fill_missing_lagrange(frame, k=5):
    """Fill missing values of the numeric columns by Lagrange interpolation.

    Elements are filled in index order, so a value filled earlier can serve
    as a neighbour for a later gap in the same column.

    Args:
        frame: DataFrame with the default 0..len-1 integer index (as produced
            by ``pd.read_csv``); modified in place.
        k: number of neighbours on each side used for every interpolation.

    Returns:
        The same ``frame`` with the gaps in its numeric columns filled.
    """
    # Only numeric columns can be interpolated; text columns are left as is.
    for column in frame.select_dtypes(include='number').columns:
        for label in frame.index[frame[column].isnull()]:
            # .at writes into the frame itself; chained assignment such as
            # frame[column][label] = ... may only modify a temporary copy.
            frame.at[label, column] = ployinterp_column(frame[column], label, k)
    return frame


def select_columns(frame, columns):
    """Return ``frame`` restricted to ``columns``, in that order.

    Columns missing from ``frame`` are created and filled with NaN. The
    columns axis of the result is named ``item``.
    """
    selected = frame.reindex(columns=list(columns))
    selected.columns.name = 'item'
    return selected


def label_volume(volume, low_upper=3000000, high_lower=4000000):
    """Map trading volumes to the levels ``low`` / ``median`` / ``high``.

    Args:
        volume: Series of numeric volumes.
        low_upper: volumes below this bound are ``low``.
        high_lower: volumes at or above this bound are ``high``; everything
            in ``[low_upper, high_lower)`` is ``median``.

    Returns:
        Series of level names aligned with ``volume``. Missing volumes stay
        missing rather than being reported as ``low``.
    """
    levels = pd.cut(
        volume,
        bins=[-np.inf, low_upper, high_lower, np.inf],
        labels=['low', 'median', 'high'],
        right=False,  # left-closed bins: [low_upper, high_lower) is median
    )
    return levels.astype(object)


if __name__ == '__main__':
    # 1. Bin the coefficient data set and summarise every bin.
    df1 = pd.read_excel(path + 'gqyjxsj.xls', header=None, skiprows=1)
    df1.columns = ['Values']  # set the column name
    add_bin_columns(df1)
    print(bin_statistics(df1, 'Group_XZDJ'))  # equal-width bins
    print(bin_statistics(df1, 'Group_XZDL'))  # equal-frequency bins

    # 2. Fill the missing values of BHP1.csv.
    inputfile = 'D:/pytest/week6data/BHP1.csv'  # input data path
    outputfile = 'D:/pytest/week6data/BHP1_RES.csv'  # output data path
    data = pd.read_csv(inputfile)
    fill_missing_lagrange(data)
    data.to_csv(outputfile)  # write the filled data

    # 3. Merge BHP1 and BHP2 into the BHP data set.
    # NOTE(review): as in the original, the raw BHP1.csv is merged here, not
    # the filled BHP1_RES.csv -- confirm which one is intended.
    inputfile = 'D:/pytest/week6data/BHP1.csv'
    data = pd.read_csv(inputfile)
    df1 = select_columns(data, BHP1_COLUMNS)
    inputfile = 'D:/pytest/week6data/BHP2.xlsx'
    data = pd.read_excel(inputfile)
    df2 = select_columns(data, BHP2_COLUMNS)
    BHP = pd.merge(df1, df2, how='outer')

    # 4. Replace the volume by its level (bounds chosen by the author).
    BHP['volume'] = label_volume(BHP['volume'])
    print(BHP)

https://yunpan.cn/cP2EICgHEjUb2 訪問密碼 d3cb