[Bro] Bro Azure HDInsight HIVE table scripts and Azure python uplod

Brandon Glaze bglaze at gmail.com
Fri Jul 8 14:35:14 PDT 2016


Good day everyone,

I wanted to share what I have done with regards to uploading my bro
clusters logs to Azure HDInsight Blob storage, then each day creating HIVE
tables from the logs. My bro cluster averages around 55-60Gbps, so sorting
through logs via zgrep, or even Elastic, is far from ideal. I found that
looking for a specific file ID took around 30 minutes to search all logs
for the day, but when using HDInsight it was under a minute.

Now I am sure there are some big data scientists on this forum, so forgive
my newbness on Hadoop (HDInsight is Hortonworks Hadoop), as well as a basic
python programmer, so that script is simple.

It is my hope that this work helps someone else, or at least gets them
started. I will see if I can sanitize my bro cluster build documents and
send that out in the hope that it also helps.

Here we go:

First I modify each bro log to be Hadoop name convention friendly (doesnt
like colons), then each hour I upload my log files with the below python
script:

=================
/etc/BRO-Azure-Upload.sh

#!/bin/bash



# Script to run each hour, which modifies the file names to be Azure Hadoop
friendly, then calls a Python script to upload

# each file to Azure blob storage



DIR=/DATA/bro-logs


DATE=`date +%Y-%m-%d "--date=last hour"`

TIME=`date +%H "--date=last hour"`



# Change to working directory

cd $DIR/$DATE



# Remove all ":" and replace with "_" in file names

ls -1 *.`date +%H "--date=last hour"`* | while read FILE ; do
newfile="$(echo ${FILE} | sed -e 's/\:/\_/g')" ; mv "${FILE}" "${newfile}"
; done



# Add the building name to each file by finding the _([[:digit:]]).log.gz
files, in case the first buildings logs have already been added

ls -1 *.`date +%H "--date=last hour"`_*_[0-9][0-9].log.gz | while read FILE
; do newfile="$(echo ${FILE} | sed -e 's/.log.gz/.B31.log.gz/')" ; mv
"${FILE}" "${newfile}" ; done



# Call python script to upload last hours logs:



for LOG in $(ls -1 *.`date +%H "--date=last hour"`_*.B01.log.gz ) ; do



/DATA/Azure-Blob-PROMPT.py $LOG $DATE ;



done

*Here is the Python script I use to upload the logs:*

=======================

Azure-Blob-PROMPT.py

=======================



REQUIRES THAT YOU INSTALL PYTHON 2.7 on CentOS/RHEL



#!/usr/local/python2.7/bin/python2.7

import sys

import azure.storage

from azure.storage.blob import BlockBlobService



FILE = sys.argv[1]

DIR = sys.argv[2]

storage_account_name = '<NAME>' # storage key in file in parent directory
called <storage_account_name>.storagekey

storage_account_key = '<KEY>'



block_blob_service = BlockBlobService(account_name=storage_account_name,
account_key=storage_account_key)



storage_container_name = '<CONTAINER>'

block_blob_service.create_container(storage_container_name)

#storage_dir = (storage_container_name,'/', DIR)

storage_dir = storage_container_name+"/"+DIR



from azure.storage.blob import ContentSettings

block_blob_service.create_blob_from_path(

    storage_dir,

    FILE,

    FILE,


content_settings=ContentSettings(content_type='application/octet-stream'),

    progress_callback=None,

    max_connections=5,

    max_retries=5,

    retry_wait=2

           )




==============
Here are the commands I use to create the HIVE tables. I create a directory
with the date via a cron job, then make a table with the entire days logs
in it.


=============
SSL:
=================
DROP TABLE IF EXISTS SSLDAILY;

CREATE EXTERNAL TABLE SSLDAILY (ts TIMESTAMP, uid STRING, orig_h STRING,
orig_p STRING, resp_h STRING, resp_p STRING, version STRING, cipher STRING,

curve STRING, server_name STRING, resumed STRING, last_alert STRING,
next_protocol STRING, established STRING, cert_chain_fuids STRING,

client_cert_chain_fuids STRING, subject STRING, issuer STRING,
client_subject STRING, client_issuer STRING)

ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n'

LOAD DATA INPATH
'wasb://<CONTAINER>@<STORAGE_ACCT>.blob.core.windows.net/<DIR>/ssl.*'
INTO TABLE SSLDAILY;

=================
CONN:
=================
DROP TABLE IF EXISTS CONNDAILY;

CREATE EXTERNAL TABLE CONNDAILY (ts TIMESTAMP, uid STRING, orig_h STRING,
orig_p STRING, resp_h STRING, resp_p STRING, proto STRING, service STRING,

duration DECIMAL, orig_bytes INT, resp_bytes INT, conn_state STRING,
local_orig STRING, local_resp STRING, missed_bytes INT, history STRING,
orig_pkts

INT, orig_ip_bytes INT, resp_pkts INT, resp_ip_bytes INT, tunnel_parents
STRING)

ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';

LOAD DATA INPATH
'wasb://<CONTAINER>@<STORAGE_ACCT>.blob.core.windows.net/<DIR>/conn.*'
INTO TABLE CONNDAILY;

==================
DNS:
==================
DROP TABLE IF EXISTS DNSDAILY;

CREATE EXTERNAL TABLE DNSDAILY (ts TIMESTAMP, uid STRING, orig_h STRING,
orig_p STRING, resp_h STRING, resp_p STRING, proto STRING, trans_id STRING,

query STRING, qclass STRING, qclass_name STRING, qtype STRING, qtype_name
STRING, rcode STRING, rcode_name STRING, AA STRING, TC STRING, RD STRING,
RA

STRING, Z STRING, answers STRING, TTLs STRING, rejected STRING)

ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';

LOAD DATA INPATH
'wasb://<CONTAINER>@<STORAGE_ACCT>.blob.core.windows.net/<DIR>/dns.*'
INTO TABLE DNSDAILY;

==================================
DPD (Dynamic Protocol Detection):
==================================
DROP TABLE IF EXISTS DPDDAILY;

CREATE EXTERNAL TABLE DPDDAILY (ts TIMESTAMP, uid STRING, orig_h STRING,
orig_p STRING, resp_h STRING, resp_p STRING, proto STRING, analyzer STRING,

failure_reason STRING)

ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';

LOAD DATA INPATH
'wasb://<CONTAINER>@<STORAGE_ACCT>.blob.core.windows.net/<DIR>/dpd.*'
INTO TABLE DPDDAILY;

==================
FILES:
==================
DROP TABLE IF EXISTS FILESDAILY;

CREATE EXTERNAL TABLE FILESDAILY (ts TIMESTAMP, fuid STRING, tx_hosts
STRING, rx_hosts STRING, conn_uids STRING, source STRING, depth STRING,
analyzers

STRING, mime_type STRING, filename STRING, duration DECIMAL, local_orig
STRING, is_orig STRING, seen_bytes INT, total_bytes INT, missing_bytes INT,

overflow_bytes INT, timedout STRING, parent_fuid STRING, md5 STRING, sha1
STRING, sha256 STRING, extracted STRING)

ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';

LOAD DATA INPATH
'wasb://<CONTAINER>@<STORAGE_ACCT>.blob.core.windows.net/<DIR>/files.*'
INTO TABLE FILESDAILY;

==================
FTP:
==================
DROP TABLE IF EXISTS FTPDAILY;

CREATE EXTERNAL TABLE FTPDAILY (ts TIMESTAMP, uid STRING, orig_h STRING,
orig_p STRING, resp_h STRING, resp_p STRING, user STRING, password STRING,

command STRING, arg STRING, mime_type STRING, file_size STRING, reply_code
STRING, reply_msg STRING, data_channel_passive STRING, data_channel_orig_h

STRING, data_channel_resp_h STRING, data_channel_resp_p STRING, fuid STRING)

ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';

LOAD DATA INPATH
'wasb://<CONTAINER>@<STORAGE_ACCT>.blob.core.windows.net/<DIR>/ftp.*'
INTO TABLE FTPDAILY;

==================
HTTP:
==================
DROP TABLE IF EXISTS HTTPDAILY;

CREATE EXTERNAL TABLE HTTPDAILY (ts TIMESTAMP, uid STRING, orig_h STRING,
orig_p STRING, resp_h STRING, resp_p STRING, trans_depth STRING, method

STRING, host STRING, uri STRING, referrer STRING, user_agent STRING,
request_body_len INT, status_code STRING, status_msg STRING, info_code
STRING,

info_msg STRING, filename STRING, tags STRING, username STRING, password
STRING, proxied STRING, orig_fuids STRING, orig_mime_types STRING,
resp_fuids

STRING, resp_mime_types STRING)

ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';

LOAD DATA INPATH
'wasb://<CONTAINER>@<STORAGE_ACCT>.blob.core.windows.net/<DIR>/http.*'
INTO TABLE HTTPDAILY;

==================
INTEL:
==================
DROP TABLE IF EXISTS INTELDAILY;

CREATE EXTERNAL TABLE INTELDAILY (ts TIMESTAMP, uid STRING, orig_h STRING,
orig_p STRING, resp_h STRING, resp_p STRING, fuid STRING, file_mime_type

STRING, file_desc STRING, seen_indicator STRING, seen_indicator_type
STRING, seen_where STRING, seen_node STRING, sources STRING)

ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';

LOAD DATA INPATH
'wasb://<CONTAINER>@<STORAGE_ACCT>.blob.core.windows.net/<DIR>/intel.*'
INTO TABLE INTELDAILY;

==================
IRC:
==================
DROP TABLE IF EXISTS IRCDAILY;

CREATE EXTERNAL TABLE IRCDAILY (ts TIMESTAMP, uid STRING, orig_h STRING,
orig_p STRING, resp_h STRING, resp_p STRING, nick STRING, user STRING,
command

STRING, value STRING, addl STRING, dcc_file_name STRING, dcc_file_size INT,
dcc_mime_type STRING, fuid STRING)

ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';

LOAD DATA INPATH
'wasb://<CONTAINER>@<STORAGE_ACCT>.blob.core.windows.net/<DIR>/irc.*'
INTO TABLE IRCDAILY;

==================
KERBEROS:
==================
DROP TABLE IF EXISTS KERBEROSDAILY;

CREATE EXTERNAL TABLE KERBEROSDAILY (ts TIMESTAMP, uid STRING, orig_h
STRING, orig_p STRING, resp_h STRING, resp_p STRING, request_type STRING,
client

STRING, service STRING, success STRING, error_msg STRING, from STRING, till
STRING, cipher STRING, forwardable STRING, renewable STRING,

client_cert_subject STRING, client_cert_fuid STRING, server_cert_subject
STRING, server_cert_fuid STRING)

ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';

LOAD DATA INPATH
'wasb://<CONTAINER>@<STORAGE_ACCT>.blob.core.windows.net/<DIR>/kerberos.*'
INTO TABLE KERBEROSDAILY;


==================
INTEL:
==================
DROP TABLE IF EXISTS INTELDAILY;

CREATE EXTERNAL TABLE INTELDAILY (ts TIMESTAMP, uid STRING, orig_h STRING,
orig_p STRING, resp_h STRING, resp_p STRING, fuid STRING, file_mime_type

STRING, file_desc STRING, seen_indicator STRING, seen_indicator_type
STRING, seen_where STRING, seen_node STRING, sources STRING)

ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';

LOAD DATA INPATH
'wasb://<CONTAINER>@<STORAGE_ACCT>.blob.core.windows.net/<DIR>/intel.*'
INTO TABLE INTELDAILY;


I welcome any feedback or questions.



=====================
Brandon Glaze
bglaze at gmail.com

"Lead me, follow me, or get the hell out of my way."
- General George Patton Jr
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://mailman.ICSI.Berkeley.EDU/pipermail/bro/attachments/20160708/84115484/attachment-0001.html 


More information about the Bro mailing list