aiwf
Version:
AI Workflow Framework for Claude Code with multi-language support (Korean/English)
1,057 lines (879 loc) • 34.1 kB
Markdown
# Data Analyst 페르소나 Knowledge Base
## 데이터 분석 도구 및 라이브러리
### Python 데이터 분석 스택
#### Pandas 고급 기능
```python
import pandas as pd
import numpy as np
# 1. 멀티 인덱싱
df = pd.DataFrame({
'date': pd.date_range('2024-01-01', periods=100),
'category': np.random.choice(['A', 'B', 'C'], 100),
'subcategory': np.random.choice(['X', 'Y'], 100),
'value': np.random.randn(100),
'quantity': np.random.randint(1, 100, 100)
})
# 멀티 인덱스 생성
df_multi = df.set_index(['category', 'subcategory'])
# 멀티 인덱스 접근
df_multi.loc[('A', 'X')] # 특정 조합
df_multi.xs('X', level='subcategory') # 특정 레벨
# 2. Window Functions
# 이동 평균
df['ma_7'] = df.groupby('category')['value'].transform(
lambda x: x.rolling(7, min_periods=1).mean()
)
# 누적 합계
df['cumsum'] = df.groupby('category')['value'].cumsum()
# Expanding window
df['expanding_mean'] = df.groupby('category')['value'].expanding().mean()
# 3. 피벗 테이블 고급 기능
pivot_table = pd.pivot_table(
df,
values=['value', 'quantity'],
index=['date'],
columns=['category'],
aggfunc={
'value': ['mean', 'std'],
'quantity': 'sum'
},
fill_value=0,
margins=True,
margins_name='Total'
)
# 4. 시계열 리샘플링
df_ts = df.set_index('date')
monthly_summary = df_ts.resample('M').agg({
'value': ['sum', 'mean', 'std'],
'quantity': 'sum'
})
# 5. Memory optimization
def optimize_dataframe(df):
    """Shrink a DataFrame's memory footprint in place and return it.

    Integer columns are downcast to the smallest signed int subtype that
    holds their observed range, float columns are downcast to float32 at
    most (float16 keeps only ~3 significant digits and silently corrupts
    data, so it is deliberately not used), and object columns become the
    ``category`` dtype.  Other dtypes (datetimes, booleans, existing
    categoricals) are left untouched instead of being mangled by the
    numeric comparisons.
    """
    for col in df.columns:
        col_type = df[col].dtype
        if pd.api.types.is_integer_dtype(col_type):
            c_min = df[col].min()
            c_max = df[col].max()
            # Inclusive bounds: a value equal to the dtype's min/max still fits.
            if np.iinfo(np.int8).min <= c_min and c_max <= np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif np.iinfo(np.int16).min <= c_min and c_max <= np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif np.iinfo(np.int32).min <= c_min and c_max <= np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            # Anything wider stays int64.
        elif pd.api.types.is_float_dtype(col_type):
            c_min = df[col].min()
            c_max = df[col].max()
            if np.finfo(np.float32).min <= c_min and c_max <= np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
        elif col_type == 'object':
            df[col] = df[col].astype('category')
    return df
```
#### NumPy 고급 연산
```python
# 1. Broadcasting
# 1. Broadcasting: b (shape (3,)) is stretched across each row of a (2, 3).
a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.array([10, 20, 30])
result = a + b

# 2. Vectorized application of a scalar function.
def custom_function(x):
    """Return log(x) for a positive scalar, 0.0 otherwise."""
    if x > 0:
        return np.log(x)
    return 0.0

# A scalar `if` cannot be applied to an array (`if array > 0` raises
# "truth value ... is ambiguous"), so wrap the function with np.vectorize.
# otypes=[float] pins the output dtype — otherwise it is inferred from the
# first element, and an int 0 would truncate every log value.
vectorized_result = np.vectorize(custom_function, otypes=[float])(
    np.array([-1, 0, 1, 2, 3])
)

# 3. Advanced indexing
arr = np.random.randn(10, 10)
mask = arr > 0
positive_values = arr[mask]  # boolean-mask selection (1-D result)
# Fancy indexing: rows[:, np.newaxis] broadcasts against cols -> 3x3 grid.
rows = np.array([0, 2, 4])
cols = np.array([1, 3, 5])
selected = arr[rows[:, np.newaxis], cols]

# 4. Linear algebra
eigenvalues, eigenvectors = np.linalg.eig(arr)  # eigendecomposition
U, s, Vt = np.linalg.svd(arr)                   # singular value decomposition

# 5. Statistics
percentiles = np.percentile(arr, [25, 50, 75], axis=0)  # per-column quartiles
correlation_matrix = np.corrcoef(arr.T)                 # column correlations
```
### SQL 고급 쿼리
#### 윈도우 함수
```sql
-- 1. Ranking functions: order each product's sales by amount.
--    ROW_NUMBER breaks ties arbitrarily, RANK leaves gaps after ties,
--    DENSE_RANK does not, PERCENT_RANK rescales RANK into [0, 1].
--    NOTE(review): the aliases "rank"/"dense_rank"/"percent_rank" are
--    reserved words in some dialects (e.g. MySQL 8) and may need quoting.
WITH sales_ranking AS (
    SELECT
        product_id,
        sale_date,
        amount,
        ROW_NUMBER() OVER (PARTITION BY product_id ORDER BY amount DESC) as row_num,
        RANK() OVER (PARTITION BY product_id ORDER BY amount DESC) as rank,
        DENSE_RANK() OVER (PARTITION BY product_id ORDER BY amount DESC) as dense_rank,
        PERCENT_RANK() OVER (PARTITION BY product_id ORDER BY amount DESC) as percent_rank
    FROM sales
)
-- Top 10 sales per product.
SELECT * FROM sales_ranking WHERE row_num <= 10;
-- 2. Moving average and running totals over a daily revenue table.
SELECT
    date,
    revenue,
    -- Trailing 7-day moving average (current row + previous 6 rows).
    AVG(revenue) OVER (
        ORDER BY date
        ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
    ) as ma_7,
    -- Running total from the first day up to the current row.
    SUM(revenue) OVER (
        ORDER BY date
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) as cumulative_revenue,
    -- Day-over-day growth: LAG steps one row back over daily rows (not one
    -- month, despite the original comment).  NULLIF guards the division
    -- when the previous day's revenue was 0.
    LAG(revenue, 1) OVER (ORDER BY date) as prev_revenue,
    (revenue - LAG(revenue, 1) OVER (ORDER BY date)) /
    NULLIF(LAG(revenue, 1) OVER (ORDER BY date), 0) * 100 as growth_rate
FROM daily_revenue;
-- 3. Advanced aggregation: monthly per-category stats plus the quartiles of
--    total_revenue across categories within each month.
--    PERCENTILE_CONT is an ordered-set aggregate; PostgreSQL does not allow
--    it as a window function (WITHIN GROUP ... OVER is a syntax error), so
--    the quartiles are computed with GROUP BY in their own CTE and joined
--    back.  Output columns are unchanged.
WITH monthly_stats AS (
    SELECT
        DATE_TRUNC('month', order_date) as month,
        category,
        COUNT(DISTINCT customer_id) as unique_customers,
        COUNT(*) as total_orders,
        SUM(amount) as total_revenue,
        AVG(amount) as avg_order_value,
        STDDEV(amount) as stddev_order_value
    FROM orders
    GROUP BY DATE_TRUNC('month', order_date), category
),
monthly_quartiles AS (
    -- One row per month with the quartiles of category revenue.
    SELECT
        month,
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY total_revenue) as q1,
        PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY total_revenue) as median,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY total_revenue) as q3
    FROM monthly_stats
    GROUP BY month
),
category_percentiles AS (
    SELECT
        ms.month,
        ms.category,
        ms.total_revenue,
        mq.q1,
        mq.median,
        mq.q3
    FROM monthly_stats ms
    JOIN monthly_quartiles mq ON mq.month = ms.month
)
SELECT * FROM category_percentiles;
-- 4. Recursive CTE: flatten a parent/child category tree into a depth level
--    and a " > "-separated breadcrumb path.
WITH RECURSIVE category_hierarchy AS (
    -- Base case: root categories (no parent).
    SELECT
        id,
        name,
        parent_id,
        0 as level,
        name as path
    FROM categories
    WHERE parent_id IS NULL
    UNION ALL
    -- Recursive case: attach children, extending level and path.
    SELECT
        c.id,
        c.name,
        c.parent_id,
        ch.level + 1,
        ch.path || ' > ' || c.name
    FROM categories c
    JOIN category_hierarchy ch ON c.parent_id = ch.id
)
SELECT * FROM category_hierarchy ORDER BY path;
```
### 통계 분석 기법
#### 가설 검정
```python
from scipy import stats
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
class StatisticalTesting:
    """Stateless hypothesis-testing helpers.

    The methods were originally defined without ``self``, which broke any
    instance call (``StatisticalTesting().normality_tests(x)`` passed the
    instance as the data).  They are now proper ``@staticmethod``s and work
    both on the class and on an instance.
    """

    @staticmethod
    def normality_tests(data):
        """Run three normality tests on a 1-D numeric array.

        Returns a dict with Shapiro-Wilk, Kolmogorov-Smirnov (against a
        normal fitted to the sample mean/std) and Anderson-Darling results.
        ``is_normal`` means "normality not rejected at alpha = 0.05".
        """
        results = {}
        # Shapiro-Wilk test
        stat, p_value = stats.shapiro(data)
        results['shapiro'] = {
            'statistic': stat,
            'p_value': p_value,
            'is_normal': p_value > 0.05
        }
        # Kolmogorov-Smirnov against N(mean, std) estimated from the data.
        stat, p_value = stats.kstest(data, 'norm',
                                     args=(data.mean(), data.std()))
        results['ks_test'] = {
            'statistic': stat,
            'p_value': p_value,
            'is_normal': p_value > 0.05
        }
        # Anderson-Darling reports critical values instead of a p-value.
        result = stats.anderson(data)
        results['anderson'] = {
            'statistic': result.statistic,
            'critical_values': result.critical_values,
            'significance_levels': result.significance_level
        }
        return results

    @staticmethod
    def variance_tests(group1, group2):
        """Levene's test for equality of variances between two samples."""
        stat, p_value = stats.levene(group1, group2)
        return {
            'levene': {
                'statistic': stat,
                'p_value': p_value,
                'equal_variance': p_value > 0.05
            }
        }

    @staticmethod
    def anova_analysis(groups, labels):
        """One-way ANOVA across ``groups`` plus a Tukey HSD post-hoc test.

        ``groups`` is a sequence of 1-D arrays and ``labels`` the matching
        group names.  Requires statsmodels' ``pairwise_tukeyhsd`` to be in
        scope (imported at module level).
        """
        f_stat, p_value = stats.f_oneway(*groups)
        # Flatten groups into one vector with a parallel label vector,
        # which is the input shape Tukey HSD expects.
        data_combined = np.concatenate(groups)
        labels_combined = np.concatenate([
            [label] * len(group)
            for label, group in zip(labels, groups)
        ])
        tukey_result = pairwise_tukeyhsd(
            data_combined,
            labels_combined,
            alpha=0.05
        )
        return {
            'anova': {
                'f_statistic': f_stat,
                'p_value': p_value,
                'significant': p_value < 0.05
            },
            'post_hoc': str(tukey_result)
        }
```
#### 회귀 분석
```python
class RegressionAnalysis:
    """OLS regression wrapper around statsmodels with assumption checks.

    ``X`` is a pandas DataFrame of predictors and ``y`` the response;
    fitted state lives on ``self.model`` / ``self.results``.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.model = None
        self.results = None

    def linear_regression(self):
        """Fit an OLS model with intercept and return a summary dict."""
        design = sm.add_constant(self.X)  # prepend the intercept column
        self.model = sm.OLS(self.y, design)
        self.results = self.model.fit()
        fit = self.results
        predictors = self.X.columns
        # Index 0 of params/pvalues/conf_int belongs to the constant,
        # hence every [1:] slice below.
        return {
            'r_squared': fit.rsquared,
            'adj_r_squared': fit.rsquared_adj,
            'f_statistic': fit.fvalue,
            'f_pvalue': fit.f_pvalue,
            'coefficients': dict(zip(predictors, fit.params[1:])),
            'p_values': dict(zip(predictors, fit.pvalues[1:])),
            'confidence_intervals': fit.conf_int()[1:].values.tolist()
        }

    def check_assumptions(self):
        """Check the classic OLS assumptions on the fitted model.

        Must be called after ``linear_regression``.  Returns booleans for
        linearity, residual normality (Jarque-Bera), homoscedasticity
        (Breusch-Pagan) and no-autocorrelation (Durbin-Watson within
        [1.5, 2.5]), plus a VIF table for multicollinearity.
        """
        from statsmodels.stats.outliers_influence import variance_inflation_factor

        fit = self.results
        residuals = fit.resid
        fitted = fit.fittedvalues
        # Fitted-vs-actual correlation significant at 5% is treated as
        # evidence of linearity here.
        linearity_ok = stats.pearsonr(fitted, self.y)[1] < 0.05
        # Jarque-Bera on the residuals; element [1] is the p-value.
        normality_p = stats.jarque_bera(residuals)[1]
        # Breusch-Pagan; element [1] is the LM-test p-value.
        bp_pvalue = sm.stats.diagnostic.het_breuschpagan(
            residuals, fit.model.exog
        )[1]
        dw_stat = sm.stats.stattools.durbin_watson(residuals)
        vif_data = pd.DataFrame()
        vif_data["Variable"] = self.X.columns
        vif_data["VIF"] = [
            variance_inflation_factor(self.X.values, i)
            for i in range(self.X.shape[1])
        ]
        return {
            'linearity': linearity_ok,
            'normality': normality_p > 0.05,
            'homoscedasticity': bp_pvalue > 0.05,
            'autocorrelation': 1.5 < dw_stat < 2.5,
            'multicollinearity': vif_data.to_dict()
        }
```
### 시계열 분석
#### 시계열 분해 및 예측
```python
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
class TimeSeriesAnalysis:
    """Stationarity tests, decomposition and forecasting for one series.

    ``self.data`` is the ``value_col`` column of ``ts_data`` indexed by
    ``date_col`` and sorted ascending.  Relies on the module-level imports
    of statsmodels (adfuller, kpss, seasonal_decompose, ARIMA) and Prophet.
    """
    def __init__(self, ts_data, date_col, value_col):
        self.data = ts_data.set_index(date_col)[value_col].sort_index()
        self.decomposition = None  # cached by decompose_series()

    def test_stationarity(self):
        """Run ADF and KPSS stationarity tests.

        The two tests have opposite null hypotheses: ADF's null is a unit
        root (p < 0.05 => stationary) while KPSS's null is stationarity
        (p > 0.05 => stationary) — hence the flipped comparisons below.
        """
        # ADF test; lag order chosen by AIC.
        adf_result = adfuller(self.data, autolag='AIC')
        # KPSS test; 'c' tests stationarity around a constant level.
        kpss_result = kpss(self.data, regression='c', nlags='auto')
        return {
            'adf': {
                'statistic': adf_result[0],
                'p_value': adf_result[1],
                'is_stationary': adf_result[1] < 0.05
            },
            'kpss': {
                'statistic': kpss_result[0],
                'p_value': kpss_result[1],
                'is_stationary': kpss_result[1] > 0.05
            }
        }

    def decompose_series(self, model='additive', period=None):
        """Split the series into trend / seasonal / residual components.

        model: 'additive' or 'multiplicative'.  period: samples per season;
        when None it is inferred from the index frequency.
        """
        self.decomposition = seasonal_decompose(
            self.data,
            model=model,
            period=period,
            extrapolate_trend='freq'  # fill the trend's edge NaNs
        )
        return {
            'trend': self.decomposition.trend,
            'seasonal': self.decomposition.seasonal,
            'residual': self.decomposition.resid
        }

    def fit_arima(self, order=(1,1,1), seasonal_order=(0,0,0,0)):
        """Fit an ARIMA model and return it with a fixed 30-step forecast."""
        model = ARIMA(self.data, order=order, seasonal_order=seasonal_order)
        fitted_model = model.fit()
        # 30-step out-of-sample forecast.
        forecast = fitted_model.forecast(steps=30)
        return {
            'model': fitted_model,
            'aic': fitted_model.aic,
            'bic': fitted_model.bic,
            'forecast': forecast,
            'summary': fitted_model.summary()
        }

    def prophet_forecast(self, periods=30):
        """Fit a Prophet model and forecast ``periods`` future points."""
        # Prophet requires the columns to be named exactly 'ds' and 'y'.
        prophet_data = pd.DataFrame({
            'ds': self.data.index,
            'y': self.data.values
        })
        # Model creation and training.
        model = Prophet(
            yearly_seasonality=True,
            weekly_seasonality=True,
            daily_seasonality=False,
            changepoint_prior_scale=0.05  # trend-change flexibility
        )
        model.fit(prophet_data)
        # Forecast over history + `periods` future rows.
        future = model.make_future_dataframe(periods=periods)
        forecast = model.predict(future)
        return {
            'model': model,
            'forecast': forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']],
            'components': model.plot_components(forecast)
        }
```
### 머신러닝 기법
#### 특성 공학
```python
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.decomposition import PCA
class FeatureEngineering:
    """Feature construction and selection helpers for an (X, y) dataset.

    ``X`` is a pandas DataFrame of numeric features; ``y`` is the target,
    used only by ``select_features``.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def create_polynomial_features(self, degree=2):
        """Return X expanded with polynomial/cross terms up to ``degree``."""
        poly = PolynomialFeatures(degree=degree, include_bias=False)
        X_poly = poly.fit_transform(self.X)
        feature_names = poly.get_feature_names_out(self.X.columns)
        return pd.DataFrame(X_poly, columns=feature_names, index=self.X.index)

    def create_interaction_features(self):
        """Return pairwise products and zero-safe ratios of all feature pairs."""
        interactions = pd.DataFrame(index=self.X.index)
        for i, col1 in enumerate(self.X.columns):
            for col2 in self.X.columns[i+1:]:
                # Multiplicative interaction.
                interactions[f'{col1}_x_{col2}'] = self.X[col1] * self.X[col2]
                # Ratio interaction; zeros become NaN to avoid division by zero.
                denominator = self.X[col2].replace(0, np.nan)
                interactions[f'{col1}_div_{col2}'] = self.X[col1] / denominator
        return interactions

    def create_lag_features(self, n_lags=3):
        """Return lagged copies (1..n_lags) of every column (leading rows NaN)."""
        lag_features = pd.DataFrame(index=self.X.index)
        for col in self.X.columns:
            for lag in range(1, n_lags + 1):
                lag_features[f'{col}_lag{lag}'] = self.X[col].shift(lag)
        return lag_features

    def select_features(self, method='univariate', n_features=10):
        """Select ``n_features`` features via 'univariate' F-test or 'rfe'.

        Returns ``(selected_feature_index, score_table)``.
        Raises ValueError for an unknown ``method`` (previously this fell
        through and crashed later with an UnboundLocalError).
        """
        if method == 'univariate':
            # ANOVA F-test between each feature and the target.
            selector = SelectKBest(score_func=f_classif, k=n_features)
            selector.fit(self.X, self.y)
            selected_features = self.X.columns[selector.get_support()]
            scores = pd.DataFrame({
                'feature': self.X.columns,
                'score': selector.scores_
            }).sort_values('score', ascending=False)
        elif method == 'rfe':
            # Recursive feature elimination with a random-forest ranker.
            from sklearn.ensemble import RandomForestClassifier
            estimator = RandomForestClassifier(n_estimators=100, random_state=42)
            selector = RFE(estimator, n_features_to_select=n_features)
            selector.fit(self.X, self.y)
            selected_features = self.X.columns[selector.support_]
            scores = pd.DataFrame({
                'feature': self.X.columns,
                'ranking': selector.ranking_
            }).sort_values('ranking')
        else:
            raise ValueError(f"unknown selection method: {method!r}")
        return selected_features, scores

    def dimensionality_reduction(self, n_components=0.95):
        """Standardize X and project it with PCA.

        A fractional ``n_components`` keeps enough components to explain
        that share of the variance; an int keeps exactly that many.
        """
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(self.X)
        pca = PCA(n_components=n_components)
        X_pca = pca.fit_transform(X_scaled)
        # Per-component explained-variance table.
        component_importance = pd.DataFrame({
            'component': [f'PC{i+1}' for i in range(len(pca.components_))],
            'explained_variance_ratio': pca.explained_variance_ratio_,
            'cumulative_variance_ratio': np.cumsum(pca.explained_variance_ratio_)
        })
        # Loadings: contribution of each original feature to each component.
        feature_importance = pd.DataFrame(
            pca.components_.T,
            columns=[f'PC{i+1}' for i in range(len(pca.components_))],
            index=self.X.columns
        )
        return {
            'transformed_data': X_pca,
            'component_importance': component_importance,
            'feature_importance': feature_importance,
            'n_components': pca.n_components_
        }
```
#### 클러스터링 분석
```python
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score
class ClusteringAnalysis:
    """KMeans/DBSCAN/hierarchical clustering with model-selection helpers.

    Features are standardized once at construction; all clustering runs on
    the scaled matrix, while cluster profiles report the original scale.
    """

    def __init__(self, X):
        self.X = X
        self.scaler = StandardScaler()
        self.X_scaled = self.scaler.fit_transform(X)

    def find_optimal_clusters(self, max_k=10):
        """Score k = 2..max_k with inertia, silhouette and Calinski-Harabasz.

        Returns ``(metrics_frame, elbow_k)`` where elbow_k is chosen by the
        largest second difference of the inertia curve (a simple elbow
        heuristic).
        """
        metrics = {
            'k': [],
            'inertia': [],
            'silhouette': [],
            'calinski_harabasz': []
        }
        for k in range(2, max_k + 1):
            kmeans = KMeans(n_clusters=k, random_state=42)
            labels = kmeans.fit_predict(self.X_scaled)
            metrics['k'].append(k)
            metrics['inertia'].append(kmeans.inertia_)
            metrics['silhouette'].append(silhouette_score(self.X_scaled, labels))
            metrics['calinski_harabasz'].append(
                calinski_harabasz_score(self.X_scaled, labels)
            )
        # Elbow heuristic via the second difference of inertia; +2 maps the
        # difference index back onto the k values, which start at 2.
        differences = np.diff(metrics['inertia'])
        elbow_point = np.argmax(differences[1:] - differences[:-1]) + 2
        return pd.DataFrame(metrics), elbow_point

    def perform_clustering(self, method='kmeans', **params):
        """Cluster the scaled data and profile each cluster.

        method: 'kmeans', 'dbscan' or 'hierarchical'; ``params`` go straight
        to the sklearn estimator.  Raises ValueError for an unknown method
        (previously this fell through and crashed on an unbound variable).
        """
        if method == 'kmeans':
            model = KMeans(**params)
        elif method == 'dbscan':
            model = DBSCAN(**params)
        elif method == 'hierarchical':
            model = AgglomerativeClustering(**params)
        else:
            raise ValueError(f"unknown clustering method: {method!r}")
        labels = model.fit_predict(self.X_scaled)
        # Per-cluster size and per-feature mean/std on the original scale.
        cluster_profiles = []
        for cluster in np.unique(labels):
            if cluster != -1:  # skip DBSCAN noise points
                cluster_mask = labels == cluster
                profile = {
                    'cluster': cluster,
                    'size': np.sum(cluster_mask),
                    'percentage': np.sum(cluster_mask) / len(labels) * 100
                }
                for col in self.X.columns:
                    profile[f'{col}_mean'] = self.X.loc[cluster_mask, col].mean()
                    profile[f'{col}_std'] = self.X.loc[cluster_mask, col].std()
                cluster_profiles.append(profile)
        return {
            'labels': labels,
            'model': model,
            'profiles': pd.DataFrame(cluster_profiles),
            # Silhouette is undefined when only one cluster was found.
            'silhouette_score': silhouette_score(self.X_scaled, labels)
            if len(np.unique(labels)) > 1 else None
        }
```
### 데이터 시각화 고급 기법
#### 고급 시각화 패턴
```python
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Rectangle
import matplotlib.patches as mpatches
class AdvancedVisualization:
    """Builders for waterfall, bullet, Sankey and radar charts."""

    def __init__(self, style='seaborn'):
        # matplotlib 3.6 removed the 'seaborn' style name; fall back to the
        # renamed 'seaborn-v0_8' (or silently keep the default style) so the
        # historic default argument does not crash on newer matplotlib.
        try:
            plt.style.use(style)
        except OSError:
            if style == 'seaborn' and 'seaborn-v0_8' in plt.style.available:
                plt.style.use('seaborn-v0_8')
        self.colors = sns.color_palette('husl', 10)

    def create_waterfall_chart(self, categories, values, title="Waterfall Chart"):
        """Draw a waterfall chart: green bars add, red bars subtract."""
        fig, ax = plt.subplots(figsize=(12, 8))
        # Running totals; prepend 0 so cumulative[i] is the level before bar i.
        cumulative = np.cumsum(values)
        cumulative = np.concatenate([[0], cumulative])
        for i, (cat, val) in enumerate(zip(categories, values)):
            if val >= 0:
                color = 'green'
                bottom = cumulative[i]    # grow upward from the prior level
            else:
                color = 'red'
                bottom = cumulative[i+1]  # drop down to the new level
            ax.bar(i, abs(val), bottom=bottom, color=color,
                   edgecolor='black', linewidth=1)
            # Annotate each bar with its signed value at mid-height.
            ax.text(i, bottom + abs(val)/2, f'{val:,.0f}',
                    ha='center', va='center', fontweight='bold')
        # Dashed connectors between consecutive bars at the running level.
        for i in range(len(categories)):
            if i < len(categories) - 1:
                ax.plot([i+0.4, i+0.6], [cumulative[i+1], cumulative[i+1]],
                        'k--', alpha=0.5)
        ax.set_xticks(range(len(categories)))
        ax.set_xticklabels(categories, rotation=45, ha='right')
        ax.set_title(title, fontsize=16, fontweight='bold')
        ax.set_ylabel('Value', fontsize=14)
        # Grid behind the bars.
        ax.yaxis.grid(True, alpha=0.3)
        ax.set_axisbelow(True)
        plt.tight_layout()
        return fig

    def create_bullet_chart(self, value, target, ranges, title=""):
        """Draw a bullet chart: grey range bands, a value bar, a target line.

        ``ranges`` is a list of (start, end) pairs — at most three bands,
        matching the hard-coded grey palette below.
        """
        fig, ax = plt.subplots(figsize=(8, 3))
        # Qualitative background bands (light -> dark grey).
        colors = ['#d3d3d3', '#a9a9a9', '#808080']
        for i, (start, end) in enumerate(ranges):
            ax.barh(0, end - start, left=start, height=1,
                    color=colors[i], alpha=0.5)
        # Actual value as a slim black bar.
        ax.barh(0, value, height=0.5, color='black')
        # Target as a vertical red marker.
        ax.plot([target, target], [-0.6, 0.6], 'r-', linewidth=3)
        ax.set_xlim(0, max(ranges[-1][1], value, target) * 1.1)
        ax.set_ylim(-1, 1)
        ax.set_yticks([])
        ax.set_title(title, fontsize=14, fontweight='bold')
        # Labels for the value and the target.
        ax.text(value/2, 0, f'{value:,.0f}',
                ha='center', va='center', color='white', fontweight='bold')
        ax.text(target, -0.8, f'Target: {target:,.0f}',
                ha='center', fontsize=10)
        return fig

    def create_sankey_diagram(self, source, target, value, labels):
        """Build a plotly Sankey figure from parallel source/target/value lists.

        NOTE(review): the ``labels`` parameter is unused — node labels come
        from the source/target names; confirm whether callers rely on it.
        """
        import plotly.graph_objects as go
        # Map node names to the integer indices plotly requires.
        all_nodes = list(set(source + target))
        node_indices = {node: i for i, node in enumerate(all_nodes)}
        source_indices = [node_indices[s] for s in source]
        target_indices = [node_indices[t] for t in target]
        fig = go.Figure(data=[go.Sankey(
            node=dict(
                pad=15,
                thickness=20,
                line=dict(color="black", width=0.5),
                label=all_nodes,
                color="blue"
            ),
            link=dict(
                source=source_indices,
                target=target_indices,
                value=value,
                color="rgba(0,0,255,0.4)"
            )
        )])
        fig.update_layout(
            title_text="Sankey Diagram",
            font_size=10,
            height=600
        )
        return fig

    def create_radar_chart(self, categories, values_dict, title="Radar Chart"):
        """Overlay one radar polygon per group in ``values_dict``.

        values_dict maps group label -> values of len(categories); assumes
        the values expose ``.tolist()`` (numpy arrays / pandas Series).
        """
        angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
        angles += angles[:1]  # repeat the first angle to close the polygon
        fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(projection='polar'))
        for label, values in values_dict.items():
            values = values.tolist()
            values += values[:1]
            ax.plot(angles, values, 'o-', linewidth=2, label=label)
            ax.fill(angles, values, alpha=0.25)
        ax.set_theta_offset(np.pi / 2)  # first axis at 12 o'clock
        ax.set_theta_direction(-1)      # clockwise
        ax.set_thetagrids(np.degrees(angles[:-1]), categories)
        ax.set_ylim(0, max([max(v) for v in values_dict.values()]) * 1.1)
        ax.set_title(title, fontsize=16, fontweight='bold', pad=20)
        ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
        return fig
```
### 리포트 자동화
#### 자동 리포트 생성 시스템
```python
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import markdown
from weasyprint import HTML, CSS
class AutomatedReportGenerator:
    """Collects titled sections and renders them as a PDF or HTML report.

    Sections are appended via add_section() and rendered in insertion order.
    NOTE(review): generate_pdf_report() calls _create_table_of_contents,
    _add_text_page, _add_chart_page and _add_table_page, none of which are
    defined in this snippet — confirm they exist elsewhere before use.
    """
    def __init__(self, template_engine='jinja2'):
        # Informational only; no template engine is actually invoked below.
        self.template_engine = template_engine
        self.sections = []  # ordered list of section dicts

    def add_section(self, title, content, section_type='text'):
        """Append a report section.

        section_type is 'text', 'chart' or 'table' for the PDF path, or
        'metrics' for the HTML path; the expected type of ``content``
        depends on it (str, figure, DataFrame, or metric->value dict).
        """
        self.sections.append({
            'title': title,
            'content': content,
            'type': section_type,
            'timestamp': datetime.now()
        })

    def generate_pdf_report(self, filename='report.pdf'):
        """Render all sections into a multi-page PDF via matplotlib."""
        with PdfPages(filename) as pdf:
            # Cover page
            self._create_cover_page(pdf)
            # Table of contents (helper not shown in this snippet)
            self._create_table_of_contents(pdf)
            # One page per section, dispatched on its type; unknown types
            # are silently skipped.
            for i, section in enumerate(self.sections):
                if section['type'] == 'text':
                    self._add_text_page(pdf, section)
                elif section['type'] == 'chart':
                    self._add_chart_page(pdf, section)
                elif section['type'] == 'table':
                    self._add_table_page(pdf, section)
            # PDF document metadata.
            d = pdf.infodict()
            d['Title'] = 'Automated Data Analysis Report'
            d['Author'] = 'Data Analysis System'
            d['Subject'] = 'Data Analysis'
            d['Keywords'] = 'Data, Analysis, Report'
            d['CreationDate'] = datetime.now()

    def _create_cover_page(self, pdf):
        """Draw a text-only cover page onto the PDF."""
        fig = plt.figure(figsize=(8.5, 11))
        fig.text(0.5, 0.6, 'Data Analysis Report',
                 ha='center', va='center', fontsize=24, fontweight='bold')
        fig.text(0.5, 0.5, f'Generated on {datetime.now().strftime("%Y-%m-%d")}',
                 ha='center', va='center', fontsize=16)
        fig.text(0.5, 0.2, 'Automated Report Generation System',
                 ha='center', va='center', fontsize=12, style='italic')
        pdf.savefig(fig, bbox_inches='tight')
        plt.close()

    def generate_html_report(self, filename='report.html'):
        """Render the sections into a standalone HTML file."""
        # Doubled braces ({{ }}) are literal CSS braces, escaped because the
        # template goes through str.format() below.
        html_template = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Data Analysis Report</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 40px; }}
        h1 {{ color: #333; border-bottom: 2px solid #333; }}
        h2 {{ color: #666; }}
        .chart {{ margin: 20px 0; text-align: center; }}
        .table {{ margin: 20px 0; }}
        table {{ border-collapse: collapse; width: 100%; }}
        th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
        th {{ background-color: #f2f2f2; font-weight: bold; }}
        .summary-box {{
            background-color: #f0f0f0;
            padding: 15px;
            border-radius: 5px;
            margin: 20px 0;
        }}
        .metric {{
            display: inline-block;
            margin: 10px 20px;
            text-align: center;
        }}
        .metric-value {{
            font-size: 24px;
            font-weight: bold;
            color: #2c3e50;
        }}
        .metric-label {{
            font-size: 14px;
            color: #7f8c8d;
        }}
    </style>
</head>
<body>
    <h1>Data Analysis Report</h1>
    <p>Generated on: {date}</p>
    {content}
</body>
</html>
"""
        content = self._generate_html_content()
        html = html_template.format(
            date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            content=content
        )
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)

    def _generate_html_content(self):
        """Build the HTML body for 'text', 'metrics' and 'table' sections.

        'chart' sections are ignored here (they are only handled on the
        PDF path).  'table' content must expose to_html(), e.g. a pandas
        DataFrame.
        """
        content_parts = []
        for section in self.sections:
            if section['type'] == 'text':
                content_parts.append(f"<h2>{section['title']}</h2>")
                content_parts.append(f"<p>{section['content']}</p>")
            elif section['type'] == 'metrics':
                content_parts.append(f"<h2>{section['title']}</h2>")
                content_parts.append('<div class="summary-box">')
                for metric, value in section['content'].items():
                    content_parts.append(f'''
                    <div class="metric">
                        <div class="metric-value">{value}</div>
                        <div class="metric-label">{metric}</div>
                    </div>
                    ''')
                content_parts.append('</div>')
            elif section['type'] == 'table':
                content_parts.append(f"<h2>{section['title']}</h2>")
                content_parts.append('<div class="table">')
                content_parts.append(section['content'].to_html(index=False))
                content_parts.append('</div>')
        return '\n'.join(content_parts)
```
### 비즈니스 인텔리전스
#### KPI 대시보드
```python
class KPIDashboard:
    """Compute business KPIs from a prepared DataFrame and plot a scorecard.

    ``data`` must carry the columns referenced in calculate_kpis():
    revenue, revenue_prev, revenue_target, customer_id, is_new_customer
    and fulfillment_hours.
    NOTE(review): _calculate_retention_rate, _calculate_churn_rate,
    _calculate_conversion_rate and _create_gauge_chart are called below but
    not defined in this snippet — confirm they exist elsewhere.
    """
    def __init__(self, data):
        self.data = data
        self.kpis = {}  # filled by calculate_kpis()

    def calculate_kpis(self):
        """Populate self.kpis with revenue, customer and operations metrics."""
        self.kpis = {
            'revenue': {
                'current': self.data['revenue'].sum(),
                'previous': self.data['revenue_prev'].sum(),
                'growth': self._calculate_growth('revenue', 'revenue_prev'),
                'target': self.data['revenue_target'].sum()
            },
            'customers': {
                'total': self.data['customer_id'].nunique(),
                'new': self.data[self.data['is_new_customer']]['customer_id'].nunique(),
                'retention_rate': self._calculate_retention_rate(),
                'churn_rate': self._calculate_churn_rate()
            },
            'operations': {
                # NOTE(review): mean of per-row revenue — assumes one row
                # per order; confirm against the data model.
                'avg_order_value': self.data['revenue'].mean(),
                'conversion_rate': self._calculate_conversion_rate(),
                'fulfillment_time': self.data['fulfillment_hours'].mean()
            }
        }

    def _calculate_growth(self, current_col, previous_col):
        """Percentage growth of current_col's total over previous_col's.

        Returns +inf when the previous total is zero, avoiding a
        ZeroDivisionError.
        """
        current = self.data[current_col].sum()
        previous = self.data[previous_col].sum()
        if previous == 0:
            return float('inf')
        return ((current - previous) / previous) * 100

    def create_scorecard(self):
        """Draw a 2x3 grid of KPI widgets; calculate_kpis() must run first."""
        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        axes = axes.flatten()
        # Revenue KPI
        self._create_kpi_widget(
            axes[0],
            'Revenue',
            self.kpis['revenue']['current'],
            self.kpis['revenue']['growth'],
            self.kpis['revenue']['target']
        )
        # Customer KPIs — note the new-customer COUNT is passed in the
        # change slot, so the widget renders it with a percent sign.
        self._create_kpi_widget(
            axes[1],
            'Total Customers',
            self.kpis['customers']['total'],
            self.kpis['customers']['new'],
            None
        )
        # Retention-rate gauge (helper not shown in this snippet).
        self._create_gauge_chart(
            axes[2],
            'Retention Rate',
            self.kpis['customers']['retention_rate'],
            [0, 100]
        )
        # Continue with other KPIs...
        plt.tight_layout()
        return fig

    def _create_kpi_widget(self, ax, title, value, change, target=None):
        """Render one text-based KPI tile on the given axes."""
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.axis('off')
        # Title
        ax.text(0.5, 0.9, title, ha='center', fontsize=14, fontweight='bold')
        # Headline value
        ax.text(0.5, 0.6, f'{value:,.0f}', ha='center', fontsize=24, fontweight='bold')
        # Change indicator: green/up for non-negative, red/down otherwise.
        color = 'green' if change >= 0 else 'red'
        arrow = '▲' if change >= 0 else '▼'
        ax.text(0.5, 0.4, f'{arrow} {abs(change):.1f}%',
                ha='center', fontsize=16, color=color)
        # Progress toward target; the falsy check also skips target == 0,
        # which avoids a division by zero.
        if target:
            progress = (value / target) * 100
            ax.text(0.5, 0.2, f'Target: {progress:.1f}%',
                    ha='center', fontsize=12, color='gray')
```