Keras 构建DNN 对用户名检测判断是否为非法用户名(从数据预处理到模型在线预测)

一.  数据集的准备与预处理

1 . 收集dataset

(大量用户名–包含正常用户名与非法用户名)

包含两个 txt 文件: legal_name.txt 和 ilegal_name.txt, 如下图所示

2. 对文件进行预处理

# Data sets
import os
import pandas as pd

# Directory that holds the raw name lists and receives the generated CSVs.
DATAPATH = "../dataset"

# Legal usernames: raw text input and processed CSV output.
# The entry point of this script labels these rows 1.
POS = os.path.join(DATAPATH, "legal_name.txt")
POS_OUTPUT = os.path.join(DATAPATH, "legal_name.csv")

# Illegal usernames: raw text input and processed CSV output.
# The entry point of this script labels these rows 0.
NEG = os.path.join(DATAPATH, "ilegal_name.txt")
NEG_OUTPUT = os.path.join(DATAPATH, "ilegal_name.csv")


def process_org_data(input_data, output_data, lable):
    """Turn a raw username file into a labelled feature CSV.

    The input is read with ``pd.read_csv`` in chunks of 10000 rows and must
    provide a ``username`` column. For every chunk the usernames are
    converted to stripped strings, a ``length`` feature is derived, the
    constant ``label`` column is added, and the chunk is appended to
    ``output_data`` without header or index.

    Args:
        input_data: Path of the raw username file.
        output_data: Path of the CSV file to append to. Because the file is
            opened in append mode, re-running adds rows to existing content.
        lable: Class label written into the ``label`` column of every row.
    """
    reader = pd.read_csv(input_data, iterator=True)
    while True:
        # Only get_chunk() signals the end of the file, so keep the try
        # block limited to that single call.
        try:
            train = reader.get_chunk(10000)
        except StopIteration:
            print("Iteration is stopped.")
            break

        # Vectorised strip: assigning map(...) to a column only works on
        # Python 2, where map() returns a list instead of a lazy iterator.
        train['username'] = train['username'].astype(str).str.strip()
        train['length'] = train['username'].apply(len)

        # ... ... (further feature columns are elided in the original article)

        # A scalar is broadcast to every row of the chunk.
        train['label'] = lable
        train.to_csv(output_data, encoding='utf-8', mode='a', index=False, header=False)


if __name__ == '__main__':
    # Legal names are written with label 1, illegal names with label 0.
    for source, target, label in ((POS, POS_OUTPUT, 1), (NEG, NEG_OUTPUT, 0)):
        process_org_data(source, target, label)

根据需求提取相应的特征, 输出成 csv 格式,包含特征列与label列

把合法用户dataset与非法用户dataset,合并打乱,切割成 train.csv 和 test.csv

 

# Load both classes, merge them and shuffle the rows.
# NOTE(review): read_dataset is not shown in this article.
pos_dataset = read_dataset(POS)
neg_dataset = read_dataset(NEG)
dataset = pd.concat([pos_dataset, neg_dataset])
dataset = dataset.sample(frac=1).reset_index(drop=True)

# Split by position. `.loc` slices are label-based and include the end label,
# so `.loc[:200000]` and `.loc[200000:]` would both contain row 200000 and
# leak one training sample into the test set. `.iloc` excludes the end.
split_at = 200000
train_data = dataset.iloc[:split_at, :]
test_data = dataset.iloc[split_at:, :]


train_data.to_csv(os.path.join(DataPath, "train.csv"), index=False)
test_data.to_csv(os.path.join(DataPath, "test.csv"), index=False)

 

 

二.  Keras 构建DNN模型进行训练与模型保存

 

import pandas as pd
import os
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np

# Show INFO-level TensorFlow log messages.
tf.logging.set_verbosity(tf.logging.INFO)

DataPath = "../dataset"

TRAIN = os.path.join(DataPath, "train.csv")
TEST = os.path.join(DataPath, "test.csv")

COLUMNS = ["username", ... , "label"]

# skiprows=1 drops the first line of each file; names=COLUMNS supplies the
# column names instead.
train_dataset = pd.read_csv(TRAIN, names=COLUMNS, skiprows=1, skipinitialspace=True)
test_dataset = pd.read_csv(TEST, names=COLUMNS, skiprows=1, skipinitialspace=True)

# Every column after the first one is forced to numeric; values that cannot
# be parsed become NaN (errors='coerce').
for frame in (train_dataset, test_dataset):
    for col in frame.columns[1:]:
        frame[col] = pd.to_numeric(frame[col], errors='coerce')

# Columns 1..18 are used as model inputs and column 19 as the target.
FEATURE_COLUMNS = range(1, 19)
LABEL_COLUMN = 19

X_train = train_dataset.iloc[:, FEATURE_COLUMNS].values
y_train = train_dataset.iloc[:, LABEL_COLUMN].values

X_test = test_dataset.iloc[:, FEATURE_COLUMNS].values
y_test = test_dataset.iloc[:, LABEL_COLUMN].values


def build_model():
    """Build the (uncompiled) binary classifier.

    The network takes 18 input features and stacks three ReLU-activated
    dense layers (64, 32 and 16 units) followed by a single sigmoid unit.

    Returns:
        The ``tf.keras`` Sequential model; the caller compiles and fits it.
    """
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(64, input_dim=18))
    # model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Activation('relu'))

    model.add(tf.keras.layers.Dense(32))
    # model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Activation('relu'))

    model.add(tf.keras.layers.Dense(16))
    # model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Activation('relu'))

    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    # A bare `return` would hand None to the caller, which then fails on
    # model.compile().
    return model



if __name__ == '__main__':
    model_file = './my_model.h5'
    if os.path.isfile(model_file):
        # Reuse a previously trained model instead of training again.
        print('model file detected. Loading.')
        model = tf.keras.models.load_model(model_file)
    else:
        print('No model file detected.  Starting from scratch.')
        model = build_model()
        model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])
        model.fit(X_train, y_train, batch_size=100, epochs=1, validation_data=(X_test, y_test))
        # Persist the trained model; without this the file checked above is
        # never created and every run retrains from scratch.
        model.save(model_file)

模型输出为 my_model.h5, 准确率约为 90%

 

三. 导出tensorflow-serving 模型, 运行在线预测服务

def save_model_for_production(model, version, path='prod_models'):
    """Export a Keras model as a SavedModel for TensorFlow Serving.

    The model is written to ``<path>/<version>`` with the SERVING tag and a
    single signature named ``'predict'``. That signature maps the input key
    ``'inputs'`` to ``model.input`` and the output key ``'output'`` to
    ``model.output``.

    Args:
        model: The Keras model to export.
        version: Name of the version sub-directory (a string such as "7").
        path: Base directory of the exported models; created when missing.
    """
    # 0 selects inference mode; 1 is the training phase and is not what a
    # serving export should use.
    tf.keras.backend.set_learning_phase(0)
    if not os.path.exists(path):
        # makedirs also creates missing parent directories.
        os.makedirs(path)
    export_path = os.path.join(
        tf.compat.as_bytes(path),
        tf.compat.as_bytes(version))
    builder = tf.saved_model.builder.SavedModelBuilder(export_path)

    model_input = tf.saved_model.utils.build_tensor_info(model.input)
    model_output = tf.saved_model.utils.build_tensor_info(model.output)

    prediction_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={'inputs': model_input},
            outputs={'output': model_output},
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))

    # Use the Keras session without a `with` block: leaving the block would
    # close the shared session and break any later use of the model.
    sess = tf.keras.backend.get_session()
    builder.add_meta_graph_and_variables(
        sess=sess, tags=[tf.saved_model.tag_constants.SERVING],
        signature_def_map={
            'predict':
                prediction_signature,
        })

    builder.save()

 

导出为 tensorflow serving 模型

# Write version "7" of the trained model below the "tf-model" directory.
export_path = "tf-model"
save_model_for_production(model, version="7", path=export_path)

运行在线预测服务(tensorflow 官方方法)

/serving/bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server --port=9000 --model_name=username --model_base_path=/data/model/tf-model

四. client通过grpc 调用预测服务

#!/usr/bin/env python  
# encoding: utf-8  

""" 
@version: v1.0 
@author: zwqjoy 
@contact: zwqjoy@163.com 
@site: https://blog.csdn.net/zwqjoy 
@file: client
@time: 2018/6/29 15:02 
"""

from grpc.beta import implementations
import tensorflow as tf

from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2
import numpy as np

# Command-line flag: address of the prediction service as "host:port".
tf.app.flags.DEFINE_string('server', '172.xxx.xxx.xxx:9000',
                           'PredictionService host:port')
# Command-line flag: the username to classify.
tf.app.flags.DEFINE_string('username', 'demo_user',
                           '传入一个username')
# Accessor for the flag values defined above.
FLAGS = tf.app.flags.FLAGS


def nametovec(username):
    """Convert a single username into the model's feature vector.

    Args:
        username: The username to encode; it is converted with ``str()`` and
            stripped of surrounding whitespace before features are derived.

    Returns:
        A numpy array whose first element is the username length, followed
        by the remaining features (elided in the original article).
    """
    # A plain Python string has no .astype(); use str() and strip the value
    # so the length is computed on a stripped string, as in training.
    username = str(username).strip()

    length = len(username)

    # ... ... (remaining feature computations are elided in the original article)

    return np.array([length, ...])


def main(_):
    """Send ``FLAGS.username`` to the prediction service and print the reply.

    The service address is taken from ``FLAGS.server`` ("host:port"). The
    username is turned into a float32 feature vector and sent as a
    ``(1, 18)`` tensor under the input key 'inputs'.
    """
    server_host, server_port = FLAGS.server.split(':')
    stub = prediction_service_pb2.beta_create_PredictionService_stub(
        implementations.insecure_channel(server_host, int(server_port)))

    # See prediction_service.proto for gRPC request/response details.
    features = nametovec(FLAGS.username).astype(np.float32)

    request = predict_pb2.PredictRequest()
    # This name must match tensorflow_model_server --model_name="username".
    request.model_spec.name = 'username'
    # This signature name must match the key used in signature_def_map.
    request.model_spec.signature_name = 'predict'
    tensor_proto = tf.contrib.util.make_tensor_proto(features, shape=(1, 18))
    request.inputs['inputs'].CopyFrom(tensor_proto)

    # The second argument is the timeout in seconds.
    print(stub.Predict(request, 10.0))


if __name__ == '__main__':
    # Hand control to TensorFlow's app runner, the entry point used with the
    # tf.app.flags definitions in this script.
    tf.app.run()

 

Categories AI