# numpy and pandas for data manipulation
import numpy as np
import pandas as pd

# model used for feature importances
import lightgbm as lgb

# utility for early stopping with a validation set
from sklearn.model_selection import train_test_split

# visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# use a non-interactive backend so plots can be rendered without a display
plt.switch_backend('agg')

# memory management
import gc

# utilities
from itertools import chain

class FeatureSelector():
"""
Class for performing feature selection for machine learning or data preprocessing.
Implements five different methods to identify features for removal
1. Find columns with a missing percentage greater than a specified threshold
2. Find columns with a single unique value
3. Find collinear variables with a correlation greater than a specified correlation coefficient
4. Find features with 0.0 feature importance from a gradient boosting machine (gbm)
5. Find low importance features that do not contribute to a specified cumulative feature importance from the gbm
Parameters
--------
data : dataframe
A dataset with observations in the rows and features in the columns
labels : array or series, default = None
Array of labels for training the machine learning model to find feature importances. These can be either binary labels
(if task is 'classification') or continuous targets (if task is 'regression').
If no labels are provided, then the feature importance based methods are not available.
Attributes
--------
ops : dict
Dictionary of operations run and features identified for removal
missing_stats : dataframe
The fraction of missing values for all features
record_missing : dataframe
The fraction of missing values for features with missing fraction above threshold
unique_stats : dataframe
Number of unique values for all features
record_single_unique : dataframe
Records the features that have a single unique value
corr_matrix : dataframe
All correlations between all features in the data
record_collinear : dataframe
Records the pairs of collinear variables with a correlation coefficient above the threshold
feature_importances : dataframe
All feature importances from the gradient boosting machine
record_zero_importance : dataframe
Records the zero importance features in the data according to the gbm
record_low_importance : dataframe
Records the lowest importance features not needed to reach the threshold of cumulative importance according to the gbm
Notes
--------
- All 5 operations can be run with the `identify_all` method.
- If using feature importances, one-hot encoding is applied to categorical variables, which creates new columns.
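Example
--------
A minimal usage sketch (`train` and `train_labels` are placeholder names for the
user's own feature dataframe and label array; the threshold values are illustrative):
>>> fs = FeatureSelector(data = train, labels = train_labels)
>>> fs.identify_missing(missing_threshold = 0.6)
>>> fs.identify_single_unique()
>>> fs.identify_collinear(correlation_threshold = 0.98)
>>> to_drop = fs.ops['missing'] + fs.ops['single_unique']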
"""
def __init__(self, data, labels=None):
# Dataset and optional training labels
self.data = data
self.labels = labels
if labels is None:
print('No labels provided. Feature importance based methods are not available.')
self.base_features = list(data.columns)
self.one_hot_features = None
# Dataframes recording information about features to remove
self.record_missing = None
self.record_single_unique = None
self.record_collinear = None
self.record_zero_importance = None
self.record_low_importance = None
self.missing_stats = None
self.unique_stats = None
self.corr_matrix = None
self.feature_importances = None
# Dictionary to hold removal operations
self.ops = {}
self.one_hot_correlated = False
def identify_missing(self, missing_threshold):
"""Find the features with a fraction of missing values above `missing_threshold`"""
self.missing_threshold = missing_threshold
# Calculate the fraction of missing in each column
missing_series = self.data.isnull().sum() / self.data.shape[0]
self.missing_stats = pd.DataFrame(missing_series).rename(columns = {'index': 'feature', 0: 'missing_fraction'})
# Sort with highest number of missing values on top
self.missing_stats = self.missing_stats.sort_values('missing_fraction', ascending = False)
# Find the columns with a missing percentage above the threshold
record_missing = pd.DataFrame(missing_series[missing_series > missing_threshold]).reset_index().rename(columns =
{'index': 'feature',
0: 'missing_fraction'})
to_drop = list(record_missing['feature'])
self.record_missing = record_missing
self.ops['missing'] = to_drop
print('%d features with greater than %0.2f missing values.\n' % (len(self.ops['missing']), self.missing_threshold))
def identify_single_unique(self):
"""Finds features with only a single unique value. NaNs do not count as a unique value. """
# Calculate the unique counts in each column
unique_counts = self.data.nunique()
self.unique_stats = pd.DataFrame(unique_counts).rename(columns = {'index': 'feature', 0: 'nunique'})
self.unique_stats = self.unique_stats.sort_values('nunique', ascending = True)
# Find the columns with only one unique count
record_single_unique = pd.DataFrame(unique_counts[unique_counts == 1]).reset_index().rename(columns = {'index': 'feature',
0: 'nunique'})
to_drop = list(record_single_unique['feature'])
self.record_single_unique = record_single_unique
self.ops['single_unique'] = to_drop
print('%d features with a single unique value.\n' % len(self.ops['single_unique']))
def identify_collinear(self, correlation_threshold, one_hot=False):
"""
Finds collinear features based on the correlation coefficient between features.
For each pair of features with a correlation coefficient greater than `correlation_threshold`,
only one of the pair is identified for removal.
Using code adapted from: https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/
Parameters
--------
correlation_threshold : float between 0 and 1
Value of the Pearson correlation coefficient above which a pair of features is identified as correlated
one_hot : boolean, default = False
Whether to one-hot encode the features before calculating the correlation coefficients
"""
self.correlation_threshold = correlation_threshold
self.one_hot_correlated = one_hot
# Calculate the correlations between every column
if one_hot:
# One hot encoding
features = pd.get_dummies(self.data)
self.one_hot_features = [column for column in features.columns if column not in self.base_features]
# Add one hot encoded data to original data
self.data_all = pd.concat([features[self.one_hot_features], self.data], axis = 1)
corr_matrix = pd.get_dummies(features).corr()
else:
corr_matrix = self.data.corr()
This resource, "65579-Python 金融大数据风控建模实战_源代码.zip", lets us explore how Python is applied to big-data risk-control modeling in finance. The "code" folder in the archive most likely contains the source code implementing the relevant algorithms and analyses, providing a hands-on learning platform. The topic is laid out in detail below.
I. The role of Python in financial risk control
Thanks to its concise syntax, rich ecosystem of libraries, and strong data-processing capabilities, Python has become the language of choice for big-data analysis and risk control in finance. In risk-control modeling, Python is used in the following key steps:
1. Data acquisition: libraries such as `pandas`, `requests`, and `BeautifulSoup` make it easy to fetch and clean data from a variety of sources (APIs, databases, web scraping).
2. Data preprocessing: `pandas` is used for data cleaning, missing-value handling, outlier detection, and feature engineering, providing high-quality data for model building.
3. Data analysis: visualization with `matplotlib` and `seaborn` helps reveal data distributions and potential relationships, while `numpy` and `scipy` supply statistical computation.
4. Modeling and prediction: Python offers many machine-learning libraries, such as `sklearn`, `tensorflow`, and `keras`, supporting a wide range of risk-assessment models including logistic regression, decision trees, random forests, support vector machines, and neural networks.
5. Model evaluation: metrics provided by `sklearn` and similar libraries, such as accuracy, recall, the F1 score, and the AUC-ROC curve, are used to assess model performance (a minimal sketch of steps 4 and 5 follows this list).
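To make steps 4 and 5 concrete, here is a minimal sketch (not taken from the archive's source code): it trains a logistic regression on a synthetic, imbalanced binary-classification dataset and scores it with AUC-ROC. All names and parameter values are illustrative.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# synthetic stand-in for a real credit dataset (class 1 = "bad" loans, deliberately rare)
X, y = make_classification(n_samples=1000, n_features=20, weights=[0.9, 0.1], random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# step 4: fit a simple classifier
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# step 5: evaluate with the AUC-ROC metric
probs = model.predict_proba(X_test)[:, 1]
print('AUC-ROC: %.3f' % roc_auc_score(y_test, probs))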
II. The workflow of big-data risk-control modeling in finance
1. Problem definition: clarify the risk problem to be solved, such as loan-default prediction or fraud detection.
2. Data collection: gather data from multiple sources such as bank transaction records, credit reports, and social media.
3. Data preprocessing: handle missing values and outliers, and transform and standardize the data.
4. Feature engineering: create new, meaningful features, which may include a customer's spending behaviour, repayment history, credit score, and so on (a small pandas sketch of steps 3 and 4 follows this list).
5. Data exploration: use visualization and statistical analysis to understand the data's distribution and the relationships between features.
6. Model selection: choose a prediction model suited to the problem, such as a classification model (e.g. logistic regression, random forest) or an anomaly-detection model (e.g. Isolation Forest).
7. Training and tuning: train the model on the training set and optimize its parameters with cross-validation.
8. Validation and evaluation: evaluate model performance on a validation set to ensure the model generalizes well.
9. Model deployment: deploy the trained model to production and monitor risk in real time.
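A rough sketch of what steps 3 and 4 can look like with `pandas` is given below; the loan-record columns and the derived debt-to-income ratio are purely hypothetical and are not fields from the dataset in the archive.

import numpy as np
import pandas as pd

# hypothetical loan records; the column names are illustrative only
df = pd.DataFrame({'income': [5000, np.nan, 8000, 12000],
                   'debt': [2000, 1500, np.nan, 3000],
                   'late_payments': [0, 2, 1, np.nan]})

# step 3: handle missing values (here, filled with each column's median)
df = df.fillna(df.median())

# step 4: engineer a new feature, e.g. a debt-to-income ratio
df['debt_to_income'] = df['debt'] / df['income']

# standardize all numeric columns (zero mean, unit variance)
df = (df - df.mean()) / df.std()
print(df)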
III. Examples of Python libraries applied to risk control
1. `pandas`: for data cleaning and processing, e.g. `fillna()` to fill missing values, `dropna()` to remove rows containing missing values, and `groupby()` for grouped analysis.
2. `matplotlib` and `seaborn`: for drawing box plots, histograms, scatter plots, and so on, which help identify outliers and trends.
3. `sklearn`: provides a range of machine-learning algorithms, such as `LogisticRegression` for binary classification, `RandomForestClassifier` for ensemble learning, and `GridSearchCV` for hyperparameter tuning (see the sketch after this list).
4. `scikit-learn-contrib`: includes the `imblearn` library for handling imbalanced datasets and improving model fairness.
5. `xgboost` or `lightgbm`: for building more efficient and more accurate gradient-boosting models.
6. `tensorflow` and `keras`: for building neural-network models for risk prediction in deep-learning scenarios.
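As an illustration of item 3, the sketch below runs a cross-validated grid search over a `RandomForestClassifier`; the synthetic data and the parameter grid are illustrative only and do not come from the archive's code.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# synthetic data standing in for real risk-control features
X, y = make_classification(n_samples=500, n_features=10, random_state=0)

# a small, purely illustrative hyperparameter grid
param_grid = {'n_estimators': [100, 200], 'max_depth': [3, 5, None]}

# 5-fold cross-validated grid search, scored by AUC-ROC
search = GridSearchCV(RandomForestClassifier(random_state=0),
                      param_grid, scoring='roc_auc', cv=5)
search.fit(X, y)
print(search.best_params_, search.best_score_)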
By studying and experimenting with this source code, readers can not only see how Python is applied to financial risk control in practice, but also work through the full process from data preprocessing to model building and evaluation, strengthening their data-analysis and risk-management skills.
