- Removed unnecessary imports and unused code
- Added parseData and loadData functions for parsing and loading the data files
- Refined the regular expression and logic used for data parsing
- Simplified the code structure to improve readability and maintainability
72 lines
2.2 KiB
Python
import re
import os

from pyspark import SparkContext

# Initialize the SparkContext
sc = SparkContext(appName="TextAnalysis")

# Data file paths
GOOGLE_PATH = 'Google.csv'
GOOGLE_SMALL_PATH = 'Google_small.csv'
AMAZON_PATH = 'Amazon.csv'
AMAZON_SMALL_PATH = 'Amazon_small.csv'
STOPWORDS_PATH = 'stopwords.txt'

# Regular expression for parsing one data line: group 1 captures the record id,
# group 2 the quoted second field, and groups 3-5 the remaining
# comma-separated fields.
DATAFILE_PATTERN = '^(.+),"(.+)",(.*),(.*),(.*)'


def removeQuotes(s):
    """ Remove all double-quote characters from the input string. """
    return ''.join(i for i in s if i != '"')


def parseDatafileLine(datafileLine):
    """ Parse a single line from a data file.

    Returns a pair whose second element marks the line as invalid (-1),
    a header (0), or successfully parsed (1).
    """
    match = re.search(DATAFILE_PATTERN, str(datafileLine))
    if match is None:
        print('Invalid datafile line: %s' % datafileLine)
        return (datafileLine, -1)
    elif match.group(1) == '"id"':
        print('Header datafile line: %s' % datafileLine)
        return (datafileLine, 0)
    else:
        # Join the second, third and fourth fields into one text value,
        # keyed by the unquoted record id.
        product = '%s %s %s' % (match.group(2), match.group(3), match.group(4))
        return ((removeQuotes(match.group(1)), product), 1)
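

# Illustration only: parse one hypothetical, well-formed line (not taken from
# the real CSV files) to show the ((id, text), 1) shape returned on success.
sampleLine = '"b001","acme widget pro",great widget,acme,19.99'
print(parseDatafileLine(sampleLine))
# -> (('b001', 'acme widget pro great widget acme'), 1)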


def parseData(filename):
    """ Parse a data file into an RDD of (parsed value, status code) pairs. """
    return (sc
            .textFile(filename, 4, 0)  # 4 partitions, use_unicode disabled
            .map(parseDatafileLine)
            .cache())
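
# Illustration only (assumes a CSV file exists at GOOGLE_SMALL_PATH): the status
# codes emitted by parseDatafileLine can be tallied directly to spot bad input,
# e.g. parseData(GOOGLE_SMALL_PATH).map(lambda pair: pair[1]).countByValue()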


def loadData(path):
    """ Load a data file, report any parse failures, and return the valid records. """
    filename = path
    raw = parseData(filename).cache()
    # Lines that failed to parse (status code -1)
    failed = (raw
              .filter(lambda s: s[1] == -1)
              .map(lambda s: s[0]))
    for line in failed.take(1):
        print('{0} - Invalid datafile line: {1}'.format(path, line))
    # Successfully parsed (id, text) pairs (status code 1)
    valid = (raw
             .filter(lambda s: s[1] == 1)
             .map(lambda s: s[0])
             .cache())
    print('{0} - Read {1} lines, successfully parsed {2} lines, failed to parse {3} lines'.format(
        path, raw.count(), valid.count(), failed.count()))
    return valid


# Load the datasets
googleSmall = loadData(GOOGLE_SMALL_PATH)
google = loadData(GOOGLE_PATH)
amazonSmall = loadData(AMAZON_SMALL_PATH)
amazon = loadData(AMAZON_PATH)

# Print a few records from each small dataset as a sanity check
for line in googleSmall.take(3):
    print('google: %s: %s\n' % (line[0], line[1]))

for line in amazonSmall.take(3):
    print('amazon: %s: %s\n' % (line[0], line[1]))

# Assuming the data has now been loaded correctly, the subsequent analysis can
# proceed from here.
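
# Illustrative sketch only, not part of the original script: STOPWORDS_PATH is
# defined above but never used, so a plausible next step is tokenizing the
# product text and filtering out stopwords. The names below (split_regex,
# simpleTokenize, tokenize, amazonRecToToken) are hypothetical.
split_regex = r'\W+'
stopwords = set(sc.textFile(STOPWORDS_PATH).collect())

def simpleTokenize(string):
    """ Lower-case a string, split on non-word characters, drop empty tokens. """
    return [t for t in re.split(split_regex, string.lower()) if t]

def tokenize(string):
    """ Tokenize a string and remove stopwords. """
    return [t for t in simpleTokenize(string) if t not in stopwords]

# Example: map each (id, text) record of the small Amazon set to (id, tokens).
amazonRecToToken = amazonSmall.map(lambda rec: (rec[0], tokenize(rec[1])))
print(amazonRecToToken.take(1))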