feat(preprocessing): 添加数据预处理示例脚本
- 添加多个数据预处理示例脚本,包括离散化、去重、缺失值处理、插值、标准化等 - 新增 CheckNULL、Interpolation、Merge 等实用工具脚本 - 提供了不同的标准化方法示例,如标准差标准化、离差标准化、小数定标标准化
This commit is contained in:
commit
3f2ae8adf5
4
4-2-1.py
Normal file
4
4-2-1.py
Normal file
@ -0,0 +1,4 @@
|
||||
import pandas as pd
|
||||
# Equal-width discretization: split the value range into 3 bins of equal width
# and report how many values land in each bin.
num = [5, 10, 11, 13, 15, 35, 50, 55, 72, 92, 204, 215]
price = pd.cut(num, bins=3)
bin_counts = price.value_counts()
print(bin_counts)
|
13
4-2-2.py
Normal file
13
4-2-2.py
Normal file
@ -0,0 +1,13 @@
|
||||
#等频离散化
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
# Equal-frequency discretization: choose bin edges at evenly spaced percentiles
# so that every bin receives (roughly) the same number of values.
num = [5, 10, 11, 13, 15, 35, 50, 55, 72, 92, 204, 215]
k = 4  # number of bins after discretization

# Percentile ranks of the k + 1 bin edges: 0, 100/k, ..., 100.
temp = [i / k * 100 for i in range(k + 1)]
print(temp)

# Bin edges: the data values found at those percentile ranks.
w = [np.percentile(num, item) for item in temp]

# pd.cut uses right-closed bins, so the minimum (the first edge) would fall
# outside every bin.  include_lowest=True closes the first bin on the left;
# unlike shrinking w[0] by a tiny factor, this also works when the minimum
# is zero or negative.
d2 = pd.cut(num, w, labels=range(k), include_lowest=True)
print(d2.value_counts())
|
11
4-2-3.py
Normal file
11
4-2-3.py
Normal file
@ -0,0 +1,11 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
inputfile = '5Preprocessing/1.xlsx'  # path to the input workbook
s = pd.read_excel(inputfile)  # load the workbook into a DataFrame
print('数据输出为:\n', s)

# Remove repeated values from column 'x'.
x_column = s['x']
data1 = x_column.drop_duplicates()
print('数据输出为:\n', data1)

# Replace missing values in column 'y' with that column's median.
y_column = s['y']
y_median = y_column.median()
data2 = y_column.fillna(y_median)
print(data2)
|
5
CheckNULL-2.py
Normal file
5
CheckNULL-2.py
Normal file
@ -0,0 +1,5 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
inputfile = '5Preprocessing/teleco_camp_orig.csv'  # path to the input CSV
camp = pd.read_csv(inputfile)  # load the data

# Boolean table with the same shape as the data: True where a value is
# present, False where it is missing.
present_mask = camp.notnull()
print("lagrange 插值前(False 为缺失值所在位置)", '\n', present_mask)
|
5
CheckNULL.py
Normal file
5
CheckNULL.py
Normal file
@ -0,0 +1,5 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
inputfile = '5Preprocessing/teleco_camp_orig.csv'  # path to the input CSV
camp = pd.read_csv(inputfile)  # load the data

# Number of missing values in each column.
missing_per_column = camp.isnull().sum()
print('缺失值的位置为:\n', missing_per_column)
|
12
Interpolation.py
Normal file
12
Interpolation.py
Normal file
@ -0,0 +1,12 @@
|
||||
from scipy.interpolate import lagrange
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
inputfile = '5Preprocessing/teleco_camp_orig.csv'  # path to the input CSV
camp = pd.read_csv(inputfile)  # load the data

# Fill the gaps in the columns at positions 2 and 3 with a Lagrange polynomial
# fitted through each column's known (row position, value) pairs.
# NOTE(review): scipy documents lagrange() as numerically unstable for more
# than roughly 20 points -- confirm the input stays small.
for i in range(2, 4):
    column = camp.iloc[:, i]
    is_missing = column.isnull().values
    # Derive row positions from the data itself instead of assuming the file
    # has exactly 20 rows, so shorter files do not raise and longer files are
    # filled completely.
    known_rows = np.flatnonzero(~is_missing)
    missing_rows = np.flatnonzero(is_missing)
    if missing_rows.size:
        la = lagrange(known_rows, column.values[known_rows])
        camp.iloc[missing_rows, i] = la(missing_rows)
    print("第%d 列缺失值的个数为:%d" % (i, camp.iloc[:, i].isnull().sum()))
print("lagrange 插值后(False 为缺失值所在位置)\n", camp.notnull())
|
8
Merge.py
Normal file
8
Merge.py
Normal file
@ -0,0 +1,8 @@
|
||||
import pandas as pd
|
||||
# Load the two tables to be joined (the alarm file is GBK-encoded).
ele_loss = pd.read_csv("5Preprocessing/ele_loss.csv")
alarm = pd.read_csv("5Preprocessing/alarm.csv", encoding='gbk')
print("ele_loss 表的形状为", ele_loss.shape)
print("alarm 表的形状为", alarm.shape)

# Inner join: keep only rows whose (ID, date) pair occurs in both tables.
join_keys = ["ID", "date"]
merge = pd.merge(
    ele_loss,
    alarm,
    left_on=join_keys,
    right_on=join_keys,
    how="inner",
)
print("合并后的表形状为:", merge.shape)
print("合并后的表为:", merge)
|
5
Read.py
Normal file
5
Read.py
Normal file
@ -0,0 +1,5 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
inputfile = '5Preprocessing/teleco_camp_orig.csv'  # path to the input CSV
camp = pd.read_csv(inputfile)  # load the data

# Show the first ten rows as a quick sanity check of the loaded table.
first_rows = camp.head(10)
print('数据输出为:\n', first_rows)
|
7
Standard.py
Normal file
7
Standard.py
Normal file
@ -0,0 +1,7 @@
|
||||
import pandas as pd
|
||||
# Load the input table from the Excel workbook; it is standardized below.
model=pd.read_excel("5Preprocessing/model.xls")
|
||||
def Standard(data):
    """Return *data* standardized to z-scores.

    Each value has the mean subtracted and is then divided by the standard
    deviation, both obtained from ``data.mean()`` and ``data.std()``.
    """
    centered = data - data.mean()
    spread = data.std()
    return centered / spread
|
||||
# Standardize the loaded table and preview the first rows of the result.
S = Standard(model)
standardized_preview = S.head()
print("标准化后的数据为:", '\n', standardized_preview)
|
11
Standards-2.py
Normal file
11
Standards-2.py
Normal file
@ -0,0 +1,11 @@
|
||||
import pandas as pd
|
||||
num = [200, 300, 400, 600, 1000]
df = pd.DataFrame(num)
print(df.describe())

# Min-max (range) normalization: map the values onto [0, 1].
col_min = df.min()
col_range = df.max() - col_min
df0 = (df - col_min) / col_range
print(df0)

# Z-score standardization: subtract the mean, divide by the standard deviation.
col_mean = df.mean()
print(col_mean)
df1 = (df - col_mean) / df.std()
print(df1)

# Decimal scaling: divide by a power of ten (10**4 here).
df2 = df / 10 ** 4
print(df2)
|
20
Standards.py
Normal file
20
Standards.py
Normal file
@ -0,0 +1,20 @@
|
||||
import pandas as pd
|
||||
# Load the input table from the Excel workbook; each scaler below is applied to it.
model=pd.read_excel("5Preprocessing/model.xls")
|
||||
def Standard(data):
    """Return *data* standardized to z-scores.

    Each value has the mean subtracted and is then divided by the standard
    deviation, both obtained from ``data.mean()`` and ``data.std()``.
    """
    centered = data - data.mean()
    spread = data.std()
    return centered / spread
|
||||
# Standardize the loaded table and preview the first rows of the result.
S = Standard(model)
standardized_preview = S.head()
print("标准化后的数据为:", '\n', standardized_preview)
|
||||
|
||||
def MinMaxScale(data):
    """Return *data* rescaled by min-max (range) normalization.

    The minimum is subtracted from every value and the result is divided by
    the range ``max - min``, so the minimum maps to 0 and the maximum to 1.
    """
    lowest = data.min()
    value_range = data.max() - lowest
    return (data - lowest) / value_range
|
||||
# Apply min-max scaling to the loaded table and preview the scaled result.
# The preview must show M (the min-max output), not S (the z-score output).
M = MinMaxScale(model)
print("离差标准化后的数据为:", '\n', M.head())
|
||||
|
||||
import numpy as np
|
||||
def DecimalScaler(data):
    """Return *data* divided by a power of ten so that every |value| < 1.

    The exponent is the smallest integer ``j`` with ``max(|data|) / 10**j < 1``.
    When *data* is a DataFrame, the exponent is computed per column.
    """
    peak = data.abs().max()
    exponent = np.ceil(np.log10(peak))
    # ceil(log10(peak)) alone leaves a peak that is an exact power of ten
    # (10, 100, 1000, ...) scaled to exactly 1.0; use one more power of ten in
    # that case so the result stays strictly below 1.
    exponent = exponent + (peak / 10 ** exponent >= 1)
    return data / 10 ** exponent
|
||||
# Apply decimal scaling to the loaded table and preview the first rows.
D = DecimalScaler(model)
decimal_preview = D.head()
print("小数定标差标准化的数据为:", '\n', decimal_preview)
|
16
main.py
Normal file
16
main.py
Normal file
@ -0,0 +1,16 @@
|
||||
# 这是一个示例 Python 脚本。
|
||||
|
||||
# 按 Shift+F10 执行或将其替换为您的代码。
|
||||
# 按 双击 Shift 在所有地方搜索类、文件、工具窗口、操作和设置。
|
||||
|
||||
|
||||
def print_hi(name):
    """Print a greeting of the form ``Hi, <name>`` to standard output."""
    greeting = f'Hi, {name}'
    print(greeting)


# Run the demo greeting only when this file is executed as a script.
if __name__ == '__main__':
    print_hi('PyCharm')
|
||||
|
||||
# 访问 https://www.jetbrains.com/help/pycharm/ 获取 PyCharm 帮助
|
Loading…
Reference in New Issue
Block a user