import re
import os
from pyspark import SparkContext

# Initialize the SparkContext
sc = SparkContext(appName="TextAnalysis")

# Data file paths
GOOGLE_PATH = 'Google.csv'
GOOGLE_SMALL_PATH = 'Google_small.csv'
AMAZON_PATH = 'Amazon.csv'
AMAZON_SMALL_PATH = 'Amazon_small.csv'
STOPWORDS_PATH = 'stopwords.txt'

# Regular expression used to parse each data line
DATAFILE_PATTERN = '^(.+),"(.+)",(.*),(.*),(.*)'


def removeQuotes(s):
    """ Remove quotation marks from the input string """
    return ''.join(i for i in s if i != '"')


def parseDatafileLine(datafileLine):
    """ Parse a single line of the data file.

    Returns a pair whose second element flags the line as invalid (-1),
    a header (0), or a successfully parsed record (1).
    """
    match = re.search(DATAFILE_PATTERN, str(datafileLine))
    if match is None:
        print('Invalid datafile line: %s' % datafileLine)
        return (datafileLine, -1)
    elif match.group(1) == '"id"':
        print('Header datafile line: %s' % datafileLine)
        return (datafileLine, 0)
    else:
        product = '%s %s %s' % (match.group(2), match.group(3), match.group(4))
        return ((removeQuotes(match.group(1)), product), 1)


def parseData(filename):
    """ Parse the data file into an RDD of (line, code) pairs """
    return (sc
            .textFile(filename, 4)
            .map(parseDatafileLine)
            .cache())


def loadData(path):
    """ Load a data file and return an RDD of the successfully parsed records """
    raw = parseData(path).cache()
    failed = (raw
              .filter(lambda s: s[1] == -1)
              .map(lambda s: s[0]))
    for line in failed.take(1):
        print('{0} - Invalid datafile line: {1}'.format(path, line))
    valid = (raw
             .filter(lambda s: s[1] == 1)
             .map(lambda s: s[0])
             .cache())
    print('{0} - Read {1} lines, successfully parsed {2} lines, failed to parse {3} lines'
          .format(path, raw.count(), valid.count(), failed.count()))
    return valid


# Load the datasets
googleSmall = loadData(GOOGLE_SMALL_PATH)
google = loadData(GOOGLE_PATH)
amazonSmall = loadData(AMAZON_SMALL_PATH)
amazon = loadData(AMAZON_PATH)

# Print a few records to inspect the data
for line in googleSmall.take(3):
    print('google: %s: %s\n' % (line[0], line[1]))

for line in amazonSmall.take(3):
    print('amazon: %s: %s\n' % (line[0], line[1]))

# With the data loaded correctly, the subsequent analysis can proceed.
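# A minimal sketch of one possible next step, assuming the subsequent analysis
# tokenizes the product descriptions and drops the words in STOPWORDS_PATH
# (which is defined above but not yet used). The names simpleTokenize/tokenize,
# amazonRecToToken, and the split pattern are illustrative assumptions, not
# part of the original script.

split_regex = r'\W+'
stopwords = set(sc.textFile(STOPWORDS_PATH).collect())


def simpleTokenize(string):
    """ Lower-case the string and split it into non-empty alphanumeric tokens """
    return [t for t in re.split(split_regex, string.lower()) if t]


def tokenize(string):
    """ Tokenize the string and remove tokens that appear in the stop-word list """
    return [t for t in simpleTokenize(string) if t not in stopwords]


# Example usage: build an RDD of (id, token list) pairs from the small Amazon set
amazonRecToToken = amazonSmall.map(lambda rec: (rec[0], tokenize(rec[1])))
for rec_id, tokens in amazonRecToToken.take(2):
    print('%s: %s' % (rec_id, tokens[:10]))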