# Query one month of power-consumption records from HBase, aggregate them
# per day with Spark, and plot the result with matplotlib.
import sys
from datetime import datetime, timedelta

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, date_format

import happybase
import matplotlib.pyplot as plt

def convert_date(date_string):
    """Convert 'yyyymmdd' to 'yyyy-mm-dd'."""
    return f"{date_string[:4]}-{date_string[4:6]}-{date_string[6:]}"


def convert_time(time_string):
    """Convert 'hhmmss' to 'hh:mm:ss'."""
    return f"{time_string[:2]}:{time_string[2:4]}:{time_string[4:]}"
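
# For example: convert_date("20070101") -> "2007-01-01"
#              convert_time("123400")   -> "12:34:00"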


def get_region_code(dateline):
    """Map a 'yyyymmddhhmmss' string to its HBase region prefix '0'..'5' by year."""
    year = dateline[:4]
    region_num = (int(year) - 2006) % 6
    return str(region_num)


def get_month_dates(input_date):
    """Return the first and last day of a 'yyyymm' month as 'yyyymmdd' strings."""
    date = datetime.strptime(input_date, "%Y%m")

    start_of_month = date.replace(day=1)

    # Roll over to the first day of the next month (handling December),
    # then step back one day to land on the last day of this month.
    next_month = start_of_month.replace(month=start_of_month.month % 12 + 1,
                                        year=start_of_month.year + (start_of_month.month // 12))
    end_of_month = next_month - timedelta(days=1)

    start_of_month_str = start_of_month.strftime("%Y%m%d")
    end_of_month_str = end_of_month.strftime("%Y%m%d")

    return start_of_month_str, end_of_month_str
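
# For example: get_month_dates("200702") -> ("20070201", "20070228")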


def get_double_rowkey(input_date):
    """Build the (start, stop) row keys '<region>_<yyyymmddhhmmss>' spanning the month."""
    split = "_"
    start_date, stop_date = get_month_dates(input_date)
    start_datetime = start_date + "000000"
    stop_datetime = stop_date + "235900"
    start_region_code = get_region_code(start_datetime)
    stop_region_code = get_region_code(stop_datetime)
    return start_region_code + split + start_datetime, stop_region_code + split + stop_datetime
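
# For example, "200701" yields ("1_20070101000000", "1_20070131235900"):
# (2007 - 2006) % 6 == 1 supplies the region prefix for both keys.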


spark = SparkSession.builder \
    .appName("HBaseSparkIntegration") \
    .getOrCreate()

hbase_host = "100.64.0.3"
hbase_table = "elec:eleclog"

# happybase connects through the HBase Thrift gateway on hbase_host.
connection = happybase.Connection(hbase_host)
table = connection.table(hbase_table)

column_family = "info"

input_date = input("Input a month (e.g. 200701): ")
# input_date = "20061230"
start_row, stop_row = get_double_rowkey(input_date)

print(start_row, stop_row)


def fetch_hbase_data(table):
    """Scan the month's row-key range; return (rows, count of missing readings)."""
    none_num = 0
    rows = []
    for key, data in table.scan(row_start=start_row.encode("utf-8"), row_stop=stop_row.encode("utf-8")):
        date = data[f'{column_family}:date'.encode('utf-8')].decode('utf-8')
        time = data[f'{column_family}:time'.encode('utf-8')].decode('utf-8')
        value = data[f'{column_family}:globalActivePower'.encode('utf-8')].decode('utf-8')
        # Missing readings are recorded as "?" in the source data.
        if value == "?":
            none_num += 1
            continue
        global_active_power = float(value)
        # Use a name that does not shadow the imported datetime class.
        timestamp = f"{convert_date(date)} {convert_time(time)}"

        rows.append((timestamp, global_active_power))
    return rows, none_num
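
# Note: happybase's scan treats row_stop as exclusive, so the row keyed
# exactly at 23:59:00 on the last day falls just outside the range.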

hbase_data, none_num = fetch_hbase_data(table)

if len(hbase_data) == 0:
    print("No data found; please check your input")
    sys.exit(0)

# Give up when 20% or more of the scanned rows had missing values.
if none_num / (none_num + len(hbase_data)) >= 0.20:
    print("This batch has too many missing values to be of analytical value")
    sys.exit(0)

columns = ["datetime", "globalActivePower"]
df = spark.createDataFrame(hbase_data, columns)

# Derive the calendar date from the 'yyyy-MM-dd HH:mm:ss' timestamp string.
df = df.withColumn("date", date_format(col("datetime"), "yyyy-MM-dd"))

# Sum each day's readings and divide by 60*24 (minutes per day), averaging
# the one-per-minute samples into a mean daily power in kW.
daily_total_power = df.groupBy("date").agg((_sum("globalActivePower") / (60*24)).alias("daily_total_power"))

daily_total_power_pd = daily_total_power.orderBy("date").toPandas()
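
# Worked example, assuming one reading per minute (as the 60*24 divisor
# implies): a day of 1440 samples averaging 1.2 kW sums to 1728, and
# 1728 / (60*24) recovers the 1.2 kW daily mean.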

plt.figure(figsize=(10, 6))
plt.plot(daily_total_power_pd['date'], daily_total_power_pd['daily_total_power'], marker='o')
plt.title('Daily Power Consumption for One Month')
plt.xlabel('Date')
plt.ylabel('Total Power Consumption (kW)')
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

spark.stop()