feat(preprocessing): 添加数据预处理示例脚本

- 添加多个数据预处理示例脚本,包括离散化、去重、缺失值处理、插值、标准化等
- 新增 CheckNULL、Interpolation、Merge 等实用工具脚本
- 提供了不同的标准化方法示例,如标准差标准化、离差标准化、小数定标标准化
This commit is contained in:
fly6516 2025-03-30 02:55:48 +08:00
commit 3f2ae8adf5
12 changed files with 117 additions and 0 deletions

4
4-2-1.py Normal file
View File

@ -0,0 +1,4 @@
# Equal-width discretization: split the value range into bins of equal width.
import pandas as pd

num = [5, 10, 11, 13, 15, 35, 50, 55, 72, 92, 204, 215]
bin_count = 3
price = pd.cut(num, bin_count)
# Report how many values landed in each bin.
bin_sizes = price.value_counts()
print(bin_sizes)

13
4-2-2.py Normal file
View File

@ -0,0 +1,13 @@
# Equal-frequency discretization: place the bin edges at evenly spaced
# percentiles so that every bin receives (roughly) the same number of values.
import pandas as pd
import numpy as np

num = [5, 10, 11, 13, 15, 35, 50, 55, 72, 92, 204, 215]
k = 4  # number of bins to produce
# Percentile ranks of the k + 1 bin edges: 0, 100/k, ..., 100.
temp = [i / k * 100 for i in range(k + 1)]
print(temp)
# Bin edges: the value found at each percentile rank.
w = [np.percentile(num, rank) for rank in temp]
# pd.cut bins are left-open, so the minimum would fall outside the first bin.
# include_lowest=True closes that edge and, unlike scaling w[0] by
# (1 - 1e-10), also works when the minimum is zero or negative.
d2 = pd.cut(num, w, labels=range(k), include_lowest=True)
print(d2.value_counts())

11
4-2-3.py Normal file
View File

@ -0,0 +1,11 @@
import pandas as pd
import numpy as np

# Path to the Excel workbook that is loaded below.
inputfile = '5Preprocessing/1.xlsx'
s = pd.read_excel(inputfile)
print('数据输出为:\n', s)

# De-duplicate column 'x'.
x_column = s['x']
data1 = x_column.drop_duplicates()
print('数据输出为:\n', data1)

# Fill the missing entries of column 'y' with that column's median.
y_column = s['y']
y_median = y_column.median()
data2 = y_column.fillna(y_median)
print(data2)

5
CheckNULL-2.py Normal file
View File

@ -0,0 +1,5 @@
import pandas as pd
import numpy as np

# Path to the CSV file that is inspected for missing values.
inputfile = '5Preprocessing/teleco_camp_orig.csv'
camp = pd.read_csv(inputfile)
# Boolean mask shaped like camp: True where a value is present,
# False where it is missing.
present_mask = camp.notnull()
print("lagrange 插值前False 为缺失值所在位置)", '\n', present_mask)

5
CheckNULL.py Normal file
View File

@ -0,0 +1,5 @@
import pandas as pd
import numpy as np

# Path to the CSV file that is inspected for missing values.
inputfile = '5Preprocessing/teleco_camp_orig.csv'
camp = pd.read_csv(inputfile)
# Number of missing cells in each column.
missing_per_column = camp.isnull().sum()
print('缺失值的位置为:\n', missing_per_column)

12
Interpolation.py Normal file
View File

@ -0,0 +1,12 @@
from scipy.interpolate import lagrange
import pandas as pd
import numpy as np

# Path to the CSV file whose gaps are filled.
inputfile = '5Preprocessing/teleco_camp_orig.csv'
camp = pd.read_csv(inputfile)

# Fill the columns at positions 2 and 3, one column at a time.
for i in range(2, 4):
    # Non-missing entries of this column; computed once and reused below.
    known = camp.iloc[:, i].dropna()
    # Lagrange polynomial through every known (row label, value) pair.
    la = lagrange(known.index, known.values)
    # Rows 0-19 that have no known value in this column.
    # NOTE(review): assumes camp has at least 20 rows and the default
    # 0..n-1 row index, so labels equal positions -- confirm.
    list_d = list(set(np.arange(0, 20)).difference(set(known.index)))
    camp.iloc[list_d, i] = la(list_d)
    print("%d 列缺失值的个数为:%d" % (i, camp.iloc[:, i].isnull().sum()))
print("lagrange 插值后False 为缺失值所在位置)\n", camp.notnull())

8
Merge.py Normal file
View File

@ -0,0 +1,8 @@
import pandas as pd

# Load the two tables to be joined; alarm.csv is decoded as GBK.
ele_loss = pd.read_csv("5Preprocessing/ele_loss.csv")
alarm = pd.read_csv("5Preprocessing/alarm.csv", encoding='gbk')
print("ele_loss 表的形状为", ele_loss.shape)
print("alarm 表的形状为", alarm.shape)

# Inner join on the (ID, date) pair: only keys found in both tables survive.
join_keys = ["ID", "date"]
merge = pd.merge(
    ele_loss,
    alarm,
    left_on=join_keys,
    right_on=join_keys,
    how="inner",
)
print("合并后的表形状为:", merge.shape)
print("合并后的表为:", merge)

5
Read.py Normal file
View File

@ -0,0 +1,5 @@
import pandas as pd
import numpy as np

# Path to the CSV file to preview.
inputfile = '5Preprocessing/teleco_camp_orig.csv'
camp = pd.read_csv(inputfile)
# Show only the first ten rows.
first_rows = camp.head(10)
print('数据输出为:\n', first_rows)

7
Standard.py Normal file
View File

@ -0,0 +1,7 @@
import pandas as pd

# Workbook holding the table that the rest of the script standardizes.
model_path = "5Preprocessing/model.xls"
model = pd.read_excel(model_path)
def Standard(data):
    """Standardize data to zero mean and unit standard deviation (z-score).

    Args:
        data: A pandas Series or DataFrame of numeric values. A DataFrame
            is standardized column by column.

    Returns:
        A new object of the same shape holding (data - mean) / std, using
        pandas' default standard deviation. A constant column (std == 0)
        comes back as all zeros instead of NaN.
    """
    spread = data.std()
    # Zero spread would make every entry 0 / 0 = NaN; adding the boolean
    # mask turns those zeros into ones so constant columns map to 0.
    spread = spread + (spread == 0)
    return (data - data.mean()) / spread
# Standardize the whole table, then preview the first rows of the result.
S = Standard(model)
preview = S.head()
print("标准化后的数据为:", '\n', preview)

11
Standards-2.py Normal file
View File

@ -0,0 +1,11 @@
# Three normalization schemes applied to one small sample.
import pandas as pd

num = [200, 300, 400, 600, 1000]
df = pd.DataFrame(num)
print(df.describe())

# Min-max normalization: rescale the values into [0, 1].
lowest, highest = df.min(), df.max()
df0 = (df - lowest) / (highest - lowest)
print(df0)

# Z-score standardization: subtract the mean, divide by the standard deviation.
center = df.mean()
print(center)
df1 = (df - center) / df.std()
print(df1)

# Decimal scaling: divide by a fixed power of ten (10**4 here).
df2 = df / 10 ** 4
print(df2)

20
Standards.py Normal file
View File

@ -0,0 +1,20 @@
import pandas as pd

# Workbook holding the table that every scaler below is applied to.
model_path = "5Preprocessing/model.xls"
model = pd.read_excel(model_path)
def Standard(data):
    """Standardize data to zero mean and unit standard deviation (z-score).

    Args:
        data: A pandas Series or DataFrame of numeric values. A DataFrame
            is standardized column by column.

    Returns:
        A new object of the same shape holding (data - mean) / std, using
        pandas' default standard deviation. A constant column (std == 0)
        comes back as all zeros instead of NaN.
    """
    spread = data.std()
    # Zero spread would make every entry 0 / 0 = NaN; adding the boolean
    # mask turns those zeros into ones so constant columns map to 0.
    spread = spread + (spread == 0)
    return (data - data.mean()) / spread
# Z-score standardize the whole table, then preview the first rows.
S = Standard(model)
preview = S.head()
print("标准化后的数据为:", '\n', preview)
def MinMaxScale(data):
    """Rescale data linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        data: A pandas Series or DataFrame of numeric values. A DataFrame
            is rescaled column by column.

    Returns:
        A new object of the same shape holding (data - min) / (max - min).
        A constant column (max == min) comes back as all zeros instead of
        NaN.
    """
    lowest = data.min()
    span = data.max() - lowest
    # A zero span would make every entry 0 / 0 = NaN; adding the boolean
    # mask turns those zeros into ones so constant columns map to 0.
    span = span + (span == 0)
    return (data - lowest) / span
# Min-max scale the whole table.
M = MinMaxScale(model)
# Preview M, the min-max result, rather than S, which holds the z-score table.
print("离差标准化后的数据为:", '\n', M.head())
import numpy as np
def DecimalScaler(data):
    """Scale data by a power of ten chosen from its largest absolute value.

    Args:
        data: A pandas Series or DataFrame of numeric values. A DataFrame
            is scaled column by column.

    Returns:
        A new object of the same shape holding data / 10**j, where j is
        floor(log10(max(|data|))) + 1, so the largest absolute value ends
        up in [0.1, 1). An all-zero column comes back as zeros.
    """
    peak = data.abs().max()
    # log10(0) is -inf, which would turn an all-zero column into NaN;
    # adding the boolean mask treats a zero peak as 1 instead.
    peak = peak + (peak == 0)
    # floor(log10(peak)) + 1 is the exponent that brings peak below 1.
    # Unlike ceil(log10(peak)), it is still large enough when peak is an
    # exact power of ten (1000 -> 0.1 rather than 1.0).
    exponent = np.floor(np.log10(peak)) + 1
    return data / 10 ** exponent
# Decimal-scale the whole table, then preview the first rows.
D = DecimalScaler(model)
preview = D.head()
print("小数定标差标准化的数据为:", '\n', preview)

16
main.py Normal file
View File

@ -0,0 +1,16 @@
# 这是一个示例 Python 脚本。
# 按 Shift+F10 执行或将其替换为您的代码。
# 按 双击 Shift 在所有地方搜索类、文件、工具窗口、操作和设置。
def print_hi(name):
    """Print the greeting "Hi, <name>" to standard output.

    Args:
        name: Value interpolated into the greeting.
    """
    # Set a breakpoint on the line below to debug the script.
    print(f'Hi, {name}')  # Press Ctrl+F8 to toggle the breakpoint.
# Press the green button in the gutter to run the script.
# The guard keeps the greeting from running when this file is imported.
if __name__ == '__main__':
    print_hi('PyCharm')

# See https://www.jetbrains.com/help/pycharm/ for PyCharm help.