From 3f2ae8adf51d03e6069378ed411f92ad2411796e Mon Sep 17 00:00:00 2001
From: fly6516
Date: Sun, 30 Mar 2025 02:55:48 +0800
Subject: [PATCH] =?UTF-8?q?feat(preprocessing):=20=E6=B7=BB=E5=8A=A0?=
 =?UTF-8?q?=E6=95=B0=E6=8D=AE=E9=A2=84=E5=A4=84=E7=90=86=E7=A4=BA=E4=BE=8B?=
 =?UTF-8?q?=E8=84=9A=E6=9C=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 添加多个数据预处理示例脚本,包括离散化、去重、缺失值处理、插值、标准化等
- 新增 CheckNULL、Interpolation、Merge 等实用工具脚本
- 提供了不同的标准化方法示例,如标准差标准化、离差标准化、小数定标标准化
---
 4-2-1.py         |  4 ++++
 4-2-2.py         | 13 +++++++++++++
 4-2-3.py         | 11 +++++++++++
 CheckNULL-2.py   |  5 +++++
 CheckNULL.py     |  5 +++++
 Interpolation.py | 12 ++++++++++++
 Merge.py         |  8 ++++++++
 Read.py          |  5 +++++
 Standard.py      |  7 +++++++
 Standards-2.py   | 11 +++++++++++
 Standards.py     | 20 ++++++++++++++++++++
 main.py          | 16 ++++++++++++++++
 12 files changed, 117 insertions(+)
 create mode 100644 4-2-1.py
 create mode 100644 4-2-2.py
 create mode 100644 4-2-3.py
 create mode 100644 CheckNULL-2.py
 create mode 100644 CheckNULL.py
 create mode 100644 Interpolation.py
 create mode 100644 Merge.py
 create mode 100644 Read.py
 create mode 100644 Standard.py
 create mode 100644 Standards-2.py
 create mode 100644 Standards.py
 create mode 100644 main.py

diff --git a/4-2-1.py b/4-2-1.py
new file mode 100644
index 0000000..9ac8d99
--- /dev/null
+++ b/4-2-1.py
@@ -0,0 +1,4 @@
+import pandas as pd
+num=[5,10,11,13,15,35,50,55,72,92,204,215]
+price=pd.cut(num,3)
+print(price.value_counts())
\ No newline at end of file
diff --git a/4-2-2.py b/4-2-2.py
new file mode 100644
index 0000000..2c58736
--- /dev/null
+++ b/4-2-2.py
@@ -0,0 +1,13 @@
+# Equal-frequency discretization
+import pandas as pd
+import numpy as np
+num=[5,10,11,13,15,35,50,55,72,92,204,215]
+k=4  # number of bins after discretization (needs k+1 percentile edges)
+temp=[ i/k*100 for i in range(k+1)]
+print(temp)
+w=[ ]
+for item in temp:
+    w.append(np.percentile(num,item))
+w[0] = w[0]*(1-1e-10)  # nudge the lowest edge down so the minimum lands inside the first (left-open) bin
+d2=pd.cut(num,w,labels=range(k))
+print(d2.value_counts())
\ No newline at end of file
diff --git a/4-2-3.py b/4-2-3.py
new file mode 100644
index 0000000..8774915
--- /dev/null
+++ b/4-2-3.py
@@ -0,0 +1,11 @@
+import pandas as pd
+import numpy as np
+inputfile = '5Preprocessing/1.xlsx' # path to the input data
+s = pd.read_excel(inputfile) # load the data
+print('数据输出为:\n',s)
+# Drop duplicate values
+data1 = s['x'].drop_duplicates()
+print('数据输出为:\n',data1)
+# Fill missing values with the column median
+data2=s['y'].fillna(s['y'].median())
+print(data2)
\ No newline at end of file
diff --git a/CheckNULL-2.py b/CheckNULL-2.py
new file mode 100644
index 0000000..f67cc66
--- /dev/null
+++ b/CheckNULL-2.py
@@ -0,0 +1,5 @@
+import pandas as pd
+import numpy as np
+inputfile = '5Preprocessing/teleco_camp_orig.csv' # path to the input data
+camp = pd.read_csv(inputfile) # load the data
+print("lagrange 插值前(False 为缺失值所在位置)",'\n', camp.notnull())
\ No newline at end of file
diff --git a/CheckNULL.py b/CheckNULL.py
new file mode 100644
index 0000000..00b9d63
--- /dev/null
+++ b/CheckNULL.py
@@ -0,0 +1,5 @@
+import pandas as pd
+import numpy as np
+inputfile = '5Preprocessing/teleco_camp_orig.csv' # path to the input data
+camp = pd.read_csv(inputfile) # load the data
+print('缺失值的位置为:\n',camp.isnull().sum())
\ No newline at end of file
diff --git a/Interpolation.py b/Interpolation.py
new file mode 100644
index 0000000..5b3015f
--- /dev/null
+++ b/Interpolation.py
@@ -0,0 +1,12 @@
+from scipy.interpolate import lagrange
+import pandas as pd
+import numpy as np
+
+inputfile = '5Preprocessing/teleco_camp_orig.csv' # path to the input data
+camp = pd.read_csv(inputfile) # load the data
+for i in range(2,4):
+    la=lagrange(camp.iloc[:,i].dropna().index,camp.iloc[:,i].dropna().values)
+    list_d=list(set(np.arange(0,20)).difference(set(camp.iloc[:,i].dropna().index)))  # NOTE(review): hard-coded to rows 0-19 - confirm against the CSV length
+    camp.iloc[list_d,i]=la(list_d)
+    print("第%d 列缺失值的个数为:%d"%(i,camp.iloc[:,i].isnull().sum()))
+print("lagrange 插值后(False 为缺失值所在位置)\n",camp.notnull())
\ No newline at end of file
diff --git a/Merge.py b/Merge.py
new file mode 100644
index 0000000..843b5e1
--- /dev/null
+++ b/Merge.py
@@ -0,0 +1,8 @@
+import pandas as pd
+ele_loss=pd.read_csv("5Preprocessing/ele_loss.csv")
+alarm=pd.read_csv("5Preprocessing/alarm.csv", encoding='gbk')
+print("ele_loss 表的形状为",ele_loss.shape)
+print("alarm 表的形状为",alarm.shape)
+merge=pd.merge(ele_loss,alarm,left_on=["ID","date"],right_on=["ID","date"],how="inner")
+print("合并后的表形状为:",merge.shape)
+print("合并后的表为:",merge)
\ No newline at end of file
diff --git a/Read.py b/Read.py
new file mode 100644
index 0000000..84291de
--- /dev/null
+++ b/Read.py
@@ -0,0 +1,5 @@
+import pandas as pd
+import numpy as np
+inputfile = '5Preprocessing/teleco_camp_orig.csv' # path to the input data
+camp = pd.read_csv(inputfile) # load the data
+print('数据输出为:\n',camp.head(10))
\ No newline at end of file
diff --git a/Standard.py b/Standard.py
new file mode 100644
index 0000000..904f694
--- /dev/null
+++ b/Standard.py
@@ -0,0 +1,7 @@
+import pandas as pd
+model=pd.read_excel("5Preprocessing/model.xls")
+def Standard(data):
+    data=(data-data.mean())/data.std()
+    return data
+S=Standard(model)
+print("标准化后的数据为:",'\n',S.head())
\ No newline at end of file
diff --git a/Standards-2.py b/Standards-2.py
new file mode 100644
index 0000000..f32ea0f
--- /dev/null
+++ b/Standards-2.py
@@ -0,0 +1,11 @@
+import pandas as pd
+num=[200,300,400,600,1000]
+df=pd.DataFrame(num)
+print(df.describe())
+df0=(df-df.min())/(df.max()-df.min()) # min-max normalization
+print(df0)
+print(df.mean()) # z-score standardization
+df1=(df-df.mean())/df.std()
+print(df1)
+df2=(df/(10**4)) # decimal-scaling normalization
+print(df2)
\ No newline at end of file
diff --git a/Standards.py b/Standards.py
new file mode 100644
index 0000000..3ae138e
--- /dev/null
+++ b/Standards.py
@@ -0,0 +1,20 @@
+import pandas as pd
+model=pd.read_excel("5Preprocessing/model.xls")
+def Standard(data):
+    data=(data-data.mean())/data.std()
+    return data
+S=Standard(model)
+print("标准化后的数据为:",'\n',S.head())
+
+def MinMaxScale(data):
+    data=(data-data.min())/(data.max()-data.min())
+    return data
+M=MinMaxScale(model)
+print("离差标准化后的数据为:",'\n',M.head())
+
+import numpy as np
+def DecimalScaler(data):
+    data=data/10**(np.floor(np.log10(data.abs().max()))+1)  # smallest power of ten that brings every |value| strictly below 1
+    return data
+D=DecimalScaler(model)
+print("小数定标差标准化的数据为:",'\n',D.head())
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..eb389a0
--- /dev/null
+++ b/main.py
@@ -0,0 +1,16 @@
+# This is a sample Python script.
+
+# Press Shift+F10 to execute it or replace it with your code.
+# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
+
+
+def print_hi(name):
+    # Use a breakpoint in the code line below to debug your script.
+    print(f'Hi, {name}')  # Press Ctrl+F8 to toggle the breakpoint.
+
+
+# Press the green button in the gutter to run the script.
+if __name__ == '__main__':
+    print_hi('PyCharm')
+
+# See PyCharm help at https://www.jetbrains.com/help/pycharm/