Motivation

I started this little project because I was mainly interested in the data my smartphone sends all the time without my knowledge. I have a bunch of apps installed on my phone and absolutely no idea what (kind of) data is being transferred to the Internet all day long. So I thought it would be a great idea to monitor/sniff my data interface (3G, EDGE etc., NOT Wi-Fi) for 24 hours during my normal daily phone usage.

Sniff environment

I used my Samsung Galaxy Note (GT-N7000) as the sniffing device. At the moment it runs a customized ROM (SlimBean) with root access. In order to be able to use sniffing tools on the phone I had to work in a chrooted environment like “Debian on Android”. This way I had access to the phone’s data interfaces and was ready to go.

u0_a99@android:/ $ deb
e2fsck 1.41.11 (14-Mar-2010)
/storage/sdcard1/debian-kit/debian.img: recovering journal
/storage/sdcard1/debian-kit/debian.img: clean, 55210/170752 files, 426942/512000 blocks
root@debian-on-android:/# ifconfig -a
...
rmnet0    Link encap:Point-to-Point Protocol  
      POINTOPOINT NOARP MULTICAST  MTU:1500  Metric:1
      RX packets:37490 errors:0 dropped:0 overruns:0 frame:0
      TX packets:30841 errors:0 dropped:0 overruns:0 carrier:0
      collisions:0 txqueuelen:1000 
      RX bytes:34233580 (32.6 MiB)  TX bytes:5906191 (5.6 MiB)

Initially I wanted to use tshark for the sniffing part, but it didn’t work very well, so I went back to old-school tcpdump. Since my data interface was going down all the time, I had to make sure that tcpdump was restarted as soon as the interface came back online. I used the following script:

root@debian-on-android:~# cat monitor.sh 
#!/bin/bash

# Restart tcpdump whenever it dies (e.g. when the data interface goes down)
# and write each run into its own timestamped capture file.
while true;
do
    tcpdump -i rmnet0 -np -w output-`date +"%Y-%m-%d-%s"`.pcap
    sleep 10
done

I fired up the script, and after 24 hours I had these output files:

root@debian-on-android:~# ls -l output-2014-01-1*
-rw-r--r--. 1 root root    24907 Jan 18 12:53 output-2014-01-18-1390049466.pcap
-rw-r--r--. 1 root root     2881 Jan 18 12:55 output-2014-01-18-1390049736.pcap
-rw-r--r--. 1 root root 14963016 Jan 18 14:02 output-2014-01-18-1390049777.pcap
-rw-r--r--. 1 root root 54695690 Jan 19 14:03 output-2014-01-18-1390053867.pcap
-rw-r--r--. 1 root root 12492822 Jan 19 16:27 output-2014-01-19-1390140216.pcap
root@debian-on-android:~# 

Merge pcap files

$ mergecap -F libpcap -a output-* -w merged.pcap

Convert pcap to SQLite3 DB

PCAP_FILE = "/home/victor/work/Projects/24h-Android-Monitoring/pcap/merged.pcap"

# Tshark generated files
DNS_QUERIES  = "/home/victor/work/Projects/24h-Android-Monitoring/pcap/dns_queries.csv"
CONNECTIONS  = "/home/victor/work/Projects/24h-Android-Monitoring/pcap/connections.csv"
HTTP_TRAFFIC = "/home/victor/work/Projects/24h-Android-Monitoring/pcap/http_traffic.csv"

Extract valuable information from pcap file

The following commands run inside an IPython notebook; the ! prefix executes tshark as a shell command and dumps the selected fields into the files defined above.

dns_queries = !tshark -r $PCAP_FILE  -R "dns.flags.response == 1"  -E occurrence=f -E header=y \
              -T fields  -e frame.number -e frame.time -e dns.qry.name -e dns.resp.addr > $DNS_QUERIES
    
connections = !tshark -r $PCAP_FILE -E header=y -E separator=\; -T fields -e frame.number \
              -e frame.time -e ip.src -e ip.dst -e tcp.dstport -e frame.protocols > $CONNECTIONS

http_traffic = !tshark -r $PCAP_FILE -Y "http.request" -E header=y -T fields \
              -e frame.number -e frame.time -e ip.dst -e http.request.method -e http.request.uri -e http.user_agent \
              -e http.response.code  -e http.response.phrase -e http.content_length -e data -e text > $HTTP_TRAFFIC

import sqlite3 as sql
con = sql.connect(":memory:")
cur = con.cursor()

import binascii
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas.io.sql as pdsql 

# Pandas settings
pd.set_option('display.height', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', 1000)

# Useful functions
def hex2str(s):
    if str(s) == 'nan':
        return None
    else:
        return bytes.fromhex(str(s)).decode('utf-8')
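
# Example (not from the capture): hex2str('48656c6c6f') returns 'Hello';
# frames without a data field come through as NaN and return None.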
    
# Import DNS queries
dns_df = pd.read_table(DNS_QUERIES)
dns_df.columns = ['frame_number', 'frame_time', 'dns_query', 'dns_response']

# Import connections
con_df = pd.read_table(CONNECTIONS, sep=";")
con_df.columns = ['frame_number', 'frame_time', 'src', 'dst', 'dstport', 'frame_protocols']

# Import http traffic
http_df = pd.read_table(HTTP_TRAFFIC)
http_df.columns = ['frame_number', 'frame_time', 'ip_dst','request_method', 'request_uri', 'user_agent','response_code', 'response_phrase','content_length', 'data', 'text']

# Convert the hex-encoded data field to a readable string
http_df['data'] = http_df['data'].apply(hex2str)


# Write to SQLite
pdsql.write_frame(dns_df, name="dns", con=con, if_exists="delete")
pdsql.write_frame(con_df, name="connection", con=con, if_exists="delete")
pdsql.write_frame(http_df, name="http", con=con, if_exists="delete")

dns_df.head(5)
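
The SQLite database above lives only in memory, so it disappears with the notebook session. If you want to keep it around for later inspection (the filename below is just an example), one way is to dump it to a file via the connection's iterdump():

# Persist the in-memory database as a SQL dump for later inspection,
# e.g. with the sqlite3 command line tool.
with open('24h-monitoring.sql', 'w') as f:   # example filename
    for line in con.iterdump():
        f.write(line + '\n')

Alternatively, connecting to a file instead of ":memory:" at the top has the same effect.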

Top DNS queries

p1 = pdsql.read_frame("""
    SELECT COUNT(dns_response) AS '# DNS Responses', dns_query AS 'DNS to lookup' 
    FROM dns GROUP BY dns_query 
    ORDER by 1 DESC
""", con)
#print(p1.head(100).to_string())
p1.head(100)
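
matplotlib is imported above but never actually used, so here is a minimal sketch that plots the result of this query as a horizontal bar chart of the ten most frequently resolved names (the column labels come from the SQL aliases above; the figure size is an arbitrary choice):

# Plot the ten most queried DNS names as a horizontal bar chart.
top10 = p1.head(10).set_index('DNS to lookup')
top10['# DNS Responses'].plot(kind='barh', figsize=(8, 4))
plt.xlabel('# DNS Responses')
plt.tight_layout()
plt.show()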

p_testing = pdsql.read_frame("""
    SELECT COUNT(*), (SELECT dns_query FROM dns WHERE dns_response=c.dst LIMIT 1) as DNS, c.dstport, c.frame_protocols FROM connection AS c
    WHERE c.dst NOT LIKE "10.%"
    GROUP by 2
    ORDER by 1 DESC
""", con)
p_testing.head(100)

Top connections

p2 = pdsql.read_frame(""" 
    SELECT COUNT(c.dst), d.dns_query, c.dstport, c.frame_protocols FROM connection AS c 
    JOIN dns AS d ON c.dst=d.dns_response
    WHERE c.frame_protocols LIKE "sll:ip:tcp:%"
    GROUP by c.dst
    ORDER by 1 DESC
""", con)
p2.head(100)

p_testing = pdsql.read_frame(""" 
    SELECT  frame_number, frame_protocols
    FROM connection
    WHERE frame_protocols LIKE 'sll:ip:tcp%xml%'
    GROUP by frame_protocols
    ORDER by 1 DESC
""", con)
p_testing.head(100)

Used protocols

p_proto = pdsql.read_frame(""" 
    SELECT COUNT(frame_protocols), frame_protocols
    FROM connection
    GROUP by frame_protocols
    ORDER BY 1 DESC
""", con)
p_proto.head(100)

Used destination ports

p_ports = pdsql.read_frame(""" 
    SELECT COUNT(c.dstport), c.dstport
    FROM connection AS c
    JOIN dns AS d ON c.dst = d.dns_response
    GROUP by c.dstport
    ORDER by 1 DESC
""", con)
p_ports.head(100)
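
To make the raw port numbers easier to read, here is a small sketch that maps well-known ports to service names using Python's socket.getservbyport; the added service column is my own and not part of the capture, and ports the library does not know simply keep their number:

import socket

def port_to_service(port):
    # Return the well-known TCP service name for a port, or the port itself.
    try:
        return socket.getservbyport(int(port), 'tcp')
    except (OSError, ValueError, TypeError):
        return port

p_ports['service'] = p_ports['dstport'].apply(port_to_service)
p_ports.head(20)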

HTTP Connections

HTTP Methods

p_http_methods = pdsql.read_frame(""" 
    SELECT COUNT(request_method), request_method
    FROM http
    GROUP by request_method
    ORDER by 1 DESC
""", con)
p_http_methods.head(100)

User Agents

p_user_agents = pdsql.read_frame(""" 
    SELECT COUNT(user_agent), user_agent
    FROM http
    GROUP by user_agent
    ORDER by 1 DESC
""", con)
p_user_agents.head(100)
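
User-Agent strings are often the quickest way to tell which app generated a request. As a rough heuristic (it will misfire on browser-style agents), the sketch below keeps only the product token before the first "/" and counts requests per token:

# Rough grouping of HTTP requests by the User-Agent product token.
agents = http_df['user_agent'].dropna()
tokens = agents.str.split('/').str.get(0).str.strip()
tokens.value_counts().head(20)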

GET Requests

p3 = pdsql.read_frame(""" 
    SELECT h.frame_number, h.ip_dst, d.dns_query, h.request_method, h.request_uri, h.user_agent FROM http AS  h
    JOIN dns AS d ON h.ip_dst = d.dns_response
    WHERE lower(h.request_method) == 'get' AND
        -- Filter out static assets (images, CSS, JS, HTML)
        (h.request_uri NOT LIKE '%.gif'  AND 
         h.request_uri NOT LIKE '%.jpg'  AND
         h.request_uri NOT LIKE '%.jpeg' AND
         h.request_uri NOT LIKE '%.png'  AND
         h.request_uri NOT LIKE '%.css'  AND 
         h.request_uri NOT LIKE '%.html' AND
         h.request_uri NOT LIKE '%.js') 
    AND
        (d.dns_query NOT LIKE '%amazon%'   AND
         d.dns_query NOT LIKE '%fahrinfo%' AND
         d.dns_query NOT LIKE '%faz.net%'  AND
         d.dns_query NOT LIKE '%heute.de%' AND
         d.dns_query NOT LIKE '%twitter%'  AND
         d.dns_query NOT LIKE '%sueddeutsche%')
    
    GROUP by h.request_uri
    ORDER by d.dns_query
""", con)
p3.head(500)
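
The interesting part of these GET requests is usually the query string, since that is where apps tend to put identifiers. As a rough sketch (parameter names of course vary per app), the URIs in p3 can be broken down with urllib.parse to see which parameter keys each host receives:

from urllib.parse import urlsplit, parse_qs

# Collect the query-string parameter names seen per DNS name.
params_per_host = {}
for _, row in p3.iterrows():
    uri = row['request_uri'] or ''
    keys = set(parse_qs(urlsplit(uri).query, keep_blank_values=True).keys())
    if keys:
        params_per_host.setdefault(row['dns_query'], set()).update(keys)

for host, keys in sorted(params_per_host.items()):
    print(host, sorted(keys))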


POST Requests

p3 = pdsql.read_frame(""" 
    SELECT h.frame_number, d.dns_query, h.request_uri, h.data, h.text FROM http AS  h
    JOIN dns AS d ON h.ip_dst = d.dns_response
    WHERE lower(h.request_method) == 'post'
    ORDER by h.ip_dst    
""", con)
p3.head(500)
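
To peek into the POST bodies themselves, the decoded data column can be inspected directly. The sketch below simply tries json.loads on every non-empty payload and prints the top-level keys, on the assumption that at least some apps send JSON; anything that does not parse is skipped:

import json

# Print the top-level JSON keys of every decodable POST body.
for _, row in p3.iterrows():
    payload = row['data']
    if not payload:
        continue
    try:
        body = json.loads(payload)
    except ValueError:
        continue  # not JSON
    if isinstance(body, dict):
        print(row['dns_query'], sorted(body.keys()))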