#!/usr/bin/python
#
# Bertrone Matteo - Polytechnic of Turin
# November 2015
#
# eBPF application that parses HTTP packets
# and extracts (and prints on screen) the URL
# contained in the GET/POST request.
#
# eBPF program http_filter is used as SOCKET_FILTER attached to eth0 interface.
# Only packets of type ip and tcp containing HTTP GET/POST are
# returned to userspace, others dropped
#
# Python script uses bcc BPF Compiler Collection by
# iovisor (https://github.com/iovisor/bcc) and prints on stdout the first
# line of the HTTP GET/POST request containing the url
from __future__ import print_function
from bcc import BPF
from sys import argv
import socket
import os
import binascii
import time
CLEANUP_N_PACKETS = 50 # cleanup every CLEANUP_N_PACKETS packets received
MAX_URL_STRING_LEN = 8192 # max url string len (usually 8K)
MAX_AGE_SECONDS = 30 # max age entry in bpf_sessions map
# print str until CR+LF
def printUntilCRLF(s):
    """Print the first line of a raw payload.

    s is a bytes object (the TCP payload, or several payloads concatenated).
    Everything before the first CR+LF is decoded and printed on stdout; if
    s contains no CR+LF the whole of s is printed.

    Bytes that are not valid UTF-8 are printed as the Unicode replacement
    character instead of raising UnicodeDecodeError: the payload comes
    straight off the wire and one odd byte must not stop the capture loop.
    """
    # only the text before the first CR+LF is needed, so split at most once
    first_line = s.split(b'\r\n', 1)[0]
    print(first_line.decode('utf-8', 'replace'))
# cleanup function
def cleanup():
    """Age out stale entries of the bpf_sessions map.

    Walks every entry of the module-level bpf_sessions table and
      * stamps entries whose timestamp is still 0 with the current time
        (presumably these were just inserted by the eBPF program, which
        does not set a wall-clock timestamp - verify against the C source);
      * deletes entries older than MAX_AGE_SECONDS, together with any
        partial request still buffered for them in local_dictionary.

    Takes no arguments and returns None. A failure on one entry is reported
    on stdout and does not stop the scan of the remaining entries.
    """
    # get current time in seconds
    current_time = int(time.time())
    # looking for leaf having:
    #   timestamp == 0        --> update with current timestamp
    #   AGE > MAX_AGE_SECONDS --> delete item
    for key, _ in bpf_sessions.items():
        try:
            # re-read the leaf so the decision is taken on its current value
            current_leaf = bpf_sessions[key]
            if current_leaf.timestamp == 0:
                # set timestamp if timestamp == 0
                bpf_sessions[key] = bpf_sessions.Leaf(current_time)
            elif current_time - current_leaf.timestamp > MAX_AGE_SECONDS:
                # delete older entries
                del bpf_sessions[key]
                # the receive loop indexes local_dictionary by the hexlified
                # key; drop the buffered partial url too, otherwise it would
                # stay there forever once the session is gone from the map
                local_dictionary.pop(binascii.hexlify(key), None)
        except Exception:
            # not a bare "except:", so Ctrl-C / SystemExit still propagate
            print("cleanup exception.")
# args
def usage():
    """Print a short usage message on stdout and terminate the script.

    Called when the command line is not understood. Never returns: it
    raises SystemExit with status 1, so the calling shell can tell an
    invalid invocation from a successful run.
    """
    # local import: the file only imports `argv` from sys. sys.exit() is used
    # instead of the exit() helper, which is injected by the `site` module
    # and is not guaranteed to exist (e.g. under `python -S`).
    import sys

    print("USAGE: %s [-i <if_name>]" % argv[0])
    print("")
    print("Try '%s -h' for more options." % argv[0])
    sys.exit(1)
# help
def help():
    """Print the full help text on stdout and terminate the script.

    Called for the -h option. Never returns: it raises SystemExit with
    status 0 (asking for help is a successful run).
    """
    # local import: the file only imports `argv` from sys. sys.exit() is used
    # instead of the exit() helper, which is injected by the `site` module
    # and is not guaranteed to exist (e.g. under `python -S`).
    import sys

    print("USAGE: %s [-i <if_name>]" % argv[0])
    print("")
    print("optional arguments:")
    print(" -h print this help")
    print(" -i if_name select interface if_name. Default is eth0")
    print("")
    print("examples:")
    print(" http-parse # bind socket to eth0")
    print(" http-parse -i wlan0 # bind socket to wlan0")
    sys.exit(0)
# arguments
# accepted command lines:
#   <script>               -> sniff on the default interface (eth0)
#   <script> -h            -> print help and exit
#   <script> -i <if_name>  -> sniff on <if_name>
# anything else prints the usage message and exits
interface = "eth0"
n_args = len(argv)
if n_args == 2:
    # a single argument can only be the help switch
    if str(argv[1]) != '-h':
        usage()
    else:
        help()
elif n_args == 3:
    # two arguments must be "-i <if_name>"
    if str(argv[1]) != '-i':
        usage()
    else:
        interface = argv[2]
elif n_args > 3:
    usage()
print("binding socket to '%s'" % interface)
# BPF setup. The statements below run once, in this order, before the
# receive loop starts; each one depends on the result of the previous one.
#
# initialize BPF - load source code from http-parse-complete.c
bpf = BPF(src_file="http-parse-complete.c", debug=0)
# load eBPF program http_filter of type SOCKET_FILTER into the kernel eBPF vm
# more info about eBPF program types
# http://man7.org/linux/man-pages/man2/bpf.2.html
function_http_filter = bpf.load_func("http_filter", BPF.SOCKET_FILTER)
# create raw socket, bind it to interface
# attach bpf program to socket created
BPF.attach_raw_socket(function_http_filter, interface)
# get file descriptor of the socket previously
# created inside BPF.attach_raw_socket
# (this descriptor is the one the receive loop reads packets from)
socket_fd = function_http_filter.sock
# create python socket object, from the file descriptor
# NOTE(review): `sock` is only used for the setblocking() call below.
# socket.fromfd() duplicates the descriptor, so presumably the blocking mode
# set through `sock` is meant to apply to socket_fd as well - confirm.
sock = socket.fromfd(socket_fd, socket.PF_PACKET,
socket.SOCK_RAW, socket.IPPROTO_IP)
# set it as blocking socket
sock.setblocking(True)
# get a handle to the bpf map named "sessions"
# (a map of type hash, according to the original author's note; the map
# itself is declared in the C source, not in this file)
bpf_sessions = bpf.get_table("sessions")
# packets counter: number of packets read so far, used by the receive loop
# to decide when to run cleanup()
packet_count = 0
# dictionary containing association
# <key(ipsrc,ipdst,portsrc,portdst),payload_string>.
# if the url is not entirely contained in a single packet,
# the first part of it is saved in this local dict;
# when \r\n is found in a following packet, the parts are
# joined and the whole url is printed
local_dictionary = {}
# ---------------------------------------------------------------------------
# Receive loop: read every packet passed up by the socket filter, print the
# first line of each HTTP message and reassemble first lines that are split
# across several TCP segments.
# ---------------------------------------------------------------------------

# ethernet header length
ETH_HLEN = 14
# minimum IPv4 header length (IHL = 5, no options)
IP_MIN_HLEN = 20
# minimum TCP header length (Data Offset = 5, no options)
TCP_MIN_HLEN = 20
# CR + LF (substring to find)
crlf = b'\r\n'
# a payload starting with one of these is the first packet of an HTTP message
HTTP_PREFIXES = (b'GET', b'POST', b'HTTP', b'PUT', b'DELETE', b'HEAD')


def _parse_packet(packet):
    """Split a raw Ethernet/IPv4/TCP frame into session fields and payload.

    packet is the bytes object read from the raw socket.

    Returns the tuple (ip_src, ip_dst, port_src, port_dst, payload), where
    the addresses and ports are integers built from the big-endian header
    bytes and payload is the bytes after the TCP header. Returns None when
    the frame is too short to hold the headers it announces, so that the
    caller can skip it instead of failing with IndexError/ValueError.
    """
    if len(packet) < ETH_HLEN + IP_MIN_HLEN:
        return None
    # bytearray gives integer items on both python 2 and python 3
    frame = bytearray(packet)

    # IP HEADER
    # https://tools.ietf.org/html/rfc791
    #  0                   1                   2                   3
    #  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
    # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    # |Version|  IHL  |Type of Service|          Total Length         |
    # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    #
    # IHL : Internet Header Length is the length of the internet header
    # value to multiply * 4 byte
    # e.g. IHL = 5 ; IP Header Length = 5 * 4 byte = 20 byte
    ip_header_length = (frame[ETH_HLEN] & 0x0F) << 2  # low nibble * 4
    tcp_offset = ETH_HLEN + ip_header_length
    if len(frame) < tcp_offset + TCP_MIN_HLEN:
        return None

    # retrieve ip source/dest (ip header offsets 12..15 and 16..19)
    ip_src = int(binascii.hexlify(packet[ETH_HLEN + 12:ETH_HLEN + 16]), 16)
    ip_dst = int(binascii.hexlify(packet[ETH_HLEN + 16:ETH_HLEN + 20]), 16)

    # TCP HEADER
    # https://www.rfc-editor.org/rfc/rfc793.txt
    #  12              13              14              15
    #  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
    # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    # |  Data |           |U|A|P|R|S|F|                               |
    # | Offset| Reserved  |R|C|S|S|Y|I|            Window             |
    # |       |           |G|K|H|T|N|N|                               |
    # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    #
    # Data Offset: This indicates where the data begins.
    # The TCP header is an integral number of 32 bits long.
    # value to multiply * 4 byte
    # e.g. DataOffset = 5 ; TCP Header Length = 5 * 4 byte = 20 byte
    # high nibble * 4  ==  (byte & 0xF0) >> 4 << 2  ==  (byte & 0xF0) >> 2
    tcp_header_length = (frame[tcp_offset + 12] & 0xF0) >> 2

    # retrieve port source/dest (tcp header offsets 0..1 and 2..3)
    port_src = int(binascii.hexlify(packet[tcp_offset:tcp_offset + 2]), 16)
    port_dst = int(binascii.hexlify(packet[tcp_offset + 2:tcp_offset + 4]), 16)

    # payload contains only what follows the tcp header
    payload = packet[tcp_offset + tcp_header_length:]
    return ip_src, ip_dst, port_src, port_dst, payload


def _drop_session(key, dict_key, error_message):
    """Forget a session: remove it from local_dictionary and bpf_sessions.

    key is the bpf_sessions key, dict_key the matching local_dictionary
    key, error_message the text printed when the map delete fails.
    """
    # always drop the buffered partial url, even if the map delete below
    # fails, so that local_dictionary cannot grow without bound
    local_dictionary.pop(dict_key, None)
    try:
        del bpf_sessions[key]
    except Exception:
        # not a bare "except:", so Ctrl-C / SystemExit still propagate
        print(error_message)


while True:
    # retrieve raw packet from socket
    # (at most 4096 bytes are read per packet; raise this value if the
    # interface can deliver larger frames)
    packet_str = os.read(socket_fd, 4096)
    packet_count += 1

    # DEBUG - print raw packet in hex format
    # packet_hex = binascii.hexlify(packet_str)
    # print ("%s" % packet_hex)

    parsed = _parse_packet(packet_str)
    if parsed is not None:
        ip_src, ip_dst, port_src, port_dst, payload_string = parsed

        # current_Key contains ip source/dest and port source/dest
        # useful for direct bpf_sessions map access
        current_Key = bpf_sessions.Key(ip_src, ip_dst, port_src, port_dst)
        # the same key, in a hashable form, indexes local_dictionary
        dict_key = binascii.hexlify(current_Key)

        if payload_string.startswith(HTTP_PREFIXES):
            # match: first packet of an HTTP message
            if crlf in payload_string:
                # first line entirely contained in this packet -> print it
                printUntilCRLF(payload_string)
                # line already printed, the session is not useful anymore
                _drop_session(current_Key, dict_key,
                              "error during delete from bpf map ")
            else:
                # first line NOT entirely contained in this packet:
                # save the part seen so far until \r\n shows up
                local_dictionary[dict_key] = payload_string
        elif current_Key in bpf_sessions:
            # NO match, but the packet belongs to a session saved in
            # bpf_sessions
            if dict_key in local_dictionary:
                # the first part of the line is already buffered:
                # append the current payload to it
                prev_payload_string = local_dictionary[dict_key] + payload_string
                # search the joined data, not only the current packet:
                # the CR and the LF may arrive in two different packets
                if crlf in prev_payload_string:
                    # last packet of a line split in N packets -> print it
                    printUntilCRLF(prev_payload_string)
                    # clean bpf_sessions & local_dictionary
                    _drop_session(current_Key, dict_key,
                                  "error deleting from map or dictionary")
                elif len(prev_payload_string) > MAX_URL_STRING_LEN:
                    # size exceeded (usually HTTP GET/POST url < 8K):
                    # give up on this session and do NOT store the
                    # oversized data again
                    print("url too long")
                    _drop_session(current_Key, dict_key,
                                  "error deleting from map or dict")
                else:
                    # NOT the last packet: keep accumulating
                    local_dictionary[dict_key] = prev_payload_string
            else:
                # nothing buffered for this session:
                # bpf_sessions contains an invalid entry -> delete it
                _drop_session(current_Key, dict_key, "error del bpf_session")

    # check if dirty entries are present in bpf_sessions
    # (done for every packet read, including the ones skipped above)
    if packet_count % CLEANUP_N_PACKETS == 0:
        cleanup()