Suppose you get a list of URLs and are asked to “investigate” them. The list is full of random URLs related to your company that nobody seems to know anything about. You don’t have a clue who is responsible for them, nor which applications (if any) are running behind them. Sounds like a cool task, huh?

Well, in today’s post I’ll show you how I managed to avoid analyzing each URL manually and saved myself a lot of time by automating things.

Set up the environment

%pylab inline
# <!-- collapse=True -->
import binascii
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import datetime as dt
import time
import ipy_table
import dnslib
import pythonwhois
import urlparse
import tldextract
import json
import os
import sys
import urllib2


from yurl import URL
from urlparse import urlparse
from IPython.display import display_pretty, display_html, display_jpeg, display_png, display_json, display_latex, display_svg

# Ipython settings
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 3000)
pd.set_option('display.column_space', 1000)

# Change working directory
os.chdir("/root/work/appsurvey")
Populating the interactive namespace from numpy and matplotlib
height has been deprecated.

First I’ll read the list of targets from a plain-text file hosted online (one URL per line).

# Fetch list of random URLs (found using Google)
response = urllib2.urlopen('http://files.ianonavy.com/urls.txt')
targets_raw = response.read()

# Create DataFrame
targets = pd.DataFrame(targets_raw.splitlines(), columns=["Target"])
print("First 20 entries in the targets list")
targets[:20]
First 20 entries in the targets list

Now I’ll split the URLs into their components:

# <!-- collapse=True -->
# Join root domain + suffix
extract_root_domain =  lambda x: '.'.join(tldextract.extract(x)[1:3])

target_columns = ['scheme', 'userinfo', 'host', 'port', 'path', 'query', 'fragment', 'decoded']
target_component = [list(URL(t)) for t in targets['Target']]

df_targets = pd.DataFrame(target_component, columns=target_columns)
empty_hosts = df_targets[df_targets['host'] == '']

# Entries without a scheme end up with an empty host and the whole URL in
# 'path', so move that value over to 'host'
for index, row in empty_hosts.iterrows():
    df_targets.loc[index, 'host'] = df_targets.loc[index, 'path']
    df_targets.loc[index, 'path'] = ''

# Extract the root domain (domain + suffix)
df_targets['root_domain'] = df_targets['host'].apply(extract_root_domain)

# Drop unnecessary columns
df_targets.drop(['query', 'fragment', 'decoded'], axis=1, inplace=True)

# Write df to file (for later use)
df_targets.to_csv("targets_df.csv", sep="\t")

print("First 20 Entries")
df_targets[:20]
First 20 Entries
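
To make the splitting logic a bit more tangible, here is a quick illustration of what tldextract returns for a single URL and how the root domain is derived from it. The URL below is just a made-up example:

import tldextract

# Hypothetical example URL, only for illustration
url = "http://blog.example.co.uk/some/path"

# tldextract splits the host into subdomain, domain and public suffix
parts = tldextract.extract(url)
print(parts[0])   # subdomain:     'blog'
print(parts[1])   # domain:        'example'
print(parts[2])   # public suffix: 'co.uk'

# Joining elements 1 and 2 yields the root domain,
# exactly what extract_root_domain above does
print('.'.join(tldextract.extract(url)[1:3]))   # 'example.co.uk'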

WHOIS

Now get WHOIS information based on data in df_targets:

%%bash
if [ ! -d "WHOIS" ]; then
    mkdir WHOIS
fi

# Get unique values
uniq_roots = df_targets['root_domain'].unique()
uniq_subdomains = df_targets['host'].unique()
# <!-- collapse=True -->

def date_handler(obj):
    return obj.isoformat() if hasattr(obj, 'isoformat') else obj

target_whois = {}

def fetch_whois(domains):
    """ Fetch WHOIS information for specified domains (list) """
    for d in domains:
        print("Get WHOIS for\t %s ..." % d)

        # Check if file already exists
        if os.path.isfile("WHOIS/%s.json" % d):
            print("File already exists. Skipping.")
            continue

        try:
            # Get WHOIS information
            whois_data = pythonwhois.get_whois(d)

            # Serialize to a JSON string (dates handled by date_handler)
            json_data = json.dumps(whois_data, default=date_handler)

            # Write contents to file; note the string gets JSON-encoded again,
            # so it must be read back with json.loads(json.load(...))
            with open('WHOIS/%s.json' % d, 'w') as outfile:
                json.dump(json_data, outfile)

            # Sleep for 20s to be gentle to the WHOIS servers
            time.sleep(20)
        except:
            print("[ERROR] Couldn't retrieve WHOIS for\t %s" % d)
            
# I'll only fetch the root domains and only the first 20. Feel free to uncomment this
# and adapt it to your needs.
#fetch_whois(uniq_subdomains)
fetch_whois(uniq_roots[:20])
    
Get WHOIS for	 altpress.org ...
Get WHOIS for	 nzfortress.co.nz ...
Get WHOIS for	 evillasforsale.com ...
Get WHOIS for	 playingenemy.com ...
Get WHOIS for	 richardsonscharts.com ...
Get WHOIS for	 xenith.net ...
Get WHOIS for	 tdbrecords.com ...
Get WHOIS for	 electrichumanproject.com ...
Get WHOIS for	 tweekerchick.blogspot.com ...
Get WHOIS for	 besound.com ...
Get WHOIS for	 porkchopscreenprinting.com ...
Get WHOIS for	 kinseyvisual.com ...
Get WHOIS for	 rathergood.com ...
Get WHOIS for	 lepoint.fr ...
Get WHOIS for	 revhq.com ...
Get WHOIS for	 poprocksandcoke.com ...
Get WHOIS for	 samuraiblue.com ...
Get WHOIS for	 openbsd.org ...
Get WHOIS for	 sysblog.com ...
Get WHOIS for	 voicesofsafety.com ...
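
Before parsing all the files in bulk, it is worth peeking into one of them to see what the later parsing code can rely on. A quick check, assuming the altpress.org lookup above went through:

import json

# Load one stored WHOIS file; it contains a double-encoded JSON string
with open('WHOIS/altpress.org.json', 'r') as f:
    whois_sample = json.loads(json.load(f))

# Top-level keys typically include 'contacts', 'raw' and (usually) 'nameservers'
print(sorted(whois_sample.keys()))

# The contact sections used later ('tech', 'admin') live under 'contacts'
print(sorted(whois_sample['contacts'].keys()))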

Get all DNS records

%%bash
if [ ! -d "DNS" ]; then
    mkdir DNS
fi

# <!-- collapse=True -->
def fetch_dns(domains):
    """ Fetch all DNS records for specified domains (list) """
    for d in domains:
        print("Dig DNS records for\t %s ..." % d)

        # Check if file already exists
        if os.path.isfile("DNS/%s.txt" % d):
            print("File already exists. Skipping.")
            continue

        # Get DNS info via IPython's shell integration
        dig_data = !dig +nocmd $d any +multiline +noall +answer
        dig_output = "\n".join(dig_data)

        # Write contents to file
        with open('DNS/%s.txt' % d, 'w') as outfile:
            outfile.write(dig_output)
        
        time.sleep(5)
        
# I'll only fetch the root domains and only the first 20. Feel free to uncomment this
# and adapt it to your needs.
#fetch_dns(uniq_subdomains)
fetch_dns(uniq_roots[:20])
Dig DNS records for	 altpress.org ...
Dig DNS records for	 nzfortress.co.nz ...
Dig DNS records for	 evillasforsale.com ...
Dig DNS records for	 playingenemy.com ...
Dig DNS records for	 richardsonscharts.com ...
Dig DNS records for	 xenith.net ...
Dig DNS records for	 tdbrecords.com ...
Dig DNS records for	 electrichumanproject.com ...
Dig DNS records for	 tweekerchick.blogspot.com ...
Dig DNS records for	 besound.com ...
Dig DNS records for	 porkchopscreenprinting.com ...
Dig DNS records for	 kinseyvisual.com ...
Dig DNS records for	 rathergood.com ...
Dig DNS records for	 lepoint.fr ...
Dig DNS records for	 revhq.com ...
Dig DNS records for	 poprocksandcoke.com ...
Dig DNS records for	 samuraiblue.com ...
Dig DNS records for	 openbsd.org ...
Dig DNS records for	 sysblog.com ...
Dig DNS records for	 voicesofsafety.com ...

Read WHOIS information

After collecting the data I’ll wrangle it in a pythonic way so it can later be exported to some useful format like Excel. I’ll therefore read the collected data from every single file, merge it, and create a DataFrame.

# <!-- collapse=True -->
from pprint import pprint

# Global DF frames
frames = []

def read_whois(domains):
    for d in domains:
        print("Reading WHOIS for\t %s" % d)
        
        try:
            with open('WHOIS/%s.json' % d, 'r') as inputfile:
                # The files contain a double-encoded JSON string (see fetch_whois),
                # hence json.loads(json.load(...))
                whois = json.loads(json.load(inputfile))

                # Delete raw record
                whois.pop('raw', None)

                data = []
                
                # Iterate contacts -> tech
                if whois['contacts']['tech']:
                    for i in whois['contacts']['tech']:
                        data.append([d, 'contacts', 'tech', i, whois['contacts']['tech'][i]])

                # Iterate contacts -> admin
                if whois['contacts']['admin']:
                    for i in whois['contacts']['admin']:
                        data.append([d, 'contacts', 'admin', i, whois['contacts']['admin'][i]])

                # Nameservers
                if "nameservers" in whois:
                    for i in whois['nameservers']:
                        data.append([d, 'nameservers', '', '', i])

                # Create DF only if data is not empty
                if data:
                    df = pd.DataFrame(data, columns=['domain', 'element', 'type', 'field', 'value'])
                    frames.append(df)

        except:
            print("[ERROR] Couldn't read WHOIS for\t %s" % d)

#read_whois(uniq_subdomains)
read_whois(uniq_roots[:20])
Reading WHOIS for	 altpress.org
Reading WHOIS for	 nzfortress.co.nz
Reading WHOIS for	 evillasforsale.com
Reading WHOIS for	 playingenemy.com
Reading WHOIS for	 richardsonscharts.com
Reading WHOIS for	 xenith.net
Reading WHOIS for	 tdbrecords.com
Reading WHOIS for	 electrichumanproject.com
Reading WHOIS for	 tweekerchick.blogspot.com
Reading WHOIS for	 besound.com
Reading WHOIS for	 porkchopscreenprinting.com
Reading WHOIS for	 kinseyvisual.com
Reading WHOIS for	 rathergood.com
Reading WHOIS for	 lepoint.fr
Reading WHOIS for	 revhq.com
Reading WHOIS for	 poprocksandcoke.com
Reading WHOIS for	 samuraiblue.com
Reading WHOIS for	 openbsd.org
Reading WHOIS for	 sysblog.com
Reading WHOIS for	 voicesofsafety.com
df_whois = pd.concat(frames)
df_whois.set_index(['domain', 'element', 'type', 'field'])
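
With the WHOIS data in this long format it’s easy to slice out specific bits, for example all nameservers or all admin e-mail addresses. A small sketch (the field name 'email' is what pythonwhois should use for contact e-mail addresses; adjust it if your records differ):

# All nameservers per domain
print(df_whois[df_whois['element'] == 'nameservers'][['domain', 'value']])

# All admin contact e-mail addresses (field name assumed to be 'email')
admin_mail = df_whois[(df_whois['type'] == 'admin') & (df_whois['field'] == 'email')]
print(admin_mail[['domain', 'value']])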

Read DNS information

Do the same with the DNS files. Each answer line returned by dig has the form name TTL class type rdata, which is why the parser below checks the fourth token for the record type and stores the fifth one as the value.

# <!-- collapse=True -->
from pprint import pprint
import re
import traceback

# Global DF frames
frames = []

def read_dns(domains):
    for d in domains:
        print("Reading WHOIS for\t %s" % d)
        data = []
        try:
            with open('DNS/%s.txt' % d, 'r') as inputfile:
                dns = inputfile.read()
                
                for l in dns.splitlines():
                    records = l.split()
                    
                    # Check only for NS, MX, A, CNAME, TXT
                    a = re.compile("^(NS|MX|A|CNAME|TXT)$")
                    if len(records) >= 4:
                        if a.match(records[3]):
                            data.append([d, records[3], records[4]])
                
                # Create DF only if data is not empty
                if data:
                    df = pd.DataFrame(data, columns=['domain', 'dns_record', 'value'])
                    frames.append(df)      
                    
        except Exception, err:
            print("[ERROR] Couldn't read WHOIS for\t %s" % d)
            traceback.print_exc()

#read_dns(uniq_subdomains)            
read_dns(uniq_roots[:20])
Reading DNS for	 altpress.org
Reading DNS for	 nzfortress.co.nz
Reading DNS for	 evillasforsale.com
Reading DNS for	 playingenemy.com
Reading DNS for	 richardsonscharts.com
Reading DNS for	 xenith.net
Reading DNS for	 tdbrecords.com
Reading DNS for	 electrichumanproject.com
Reading DNS for	 tweekerchick.blogspot.com
Reading DNS for	 besound.com
Reading DNS for	 porkchopscreenprinting.com
Reading DNS for	 kinseyvisual.com
Reading DNS for	 rathergood.com
Reading DNS for	 lepoint.fr
Reading DNS for	 revhq.com
Reading DNS for	 poprocksandcoke.com
Reading DNS for	 samuraiblue.com
Reading DNS for	 openbsd.org
Reading DNS for	 sysblog.com
Reading DNS for	 voicesofsafety.com
df_dns = pd.concat(frames)
df_dns.set_index(['domain', 'dns_record'])
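
Again the long format pays off: a quick groupby shows how many records of each type were found per domain. A small sketch based on df_dns from above:

# Count DNS records per domain and record type
record_counts = df_dns.groupby(['domain', 'dns_record']).size()
print(record_counts)

# The same as a matrix: domains as rows, record types as columns
print(record_counts.unstack().fillna(0))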

Connect to targets

For every single target I’ll connect over HTTP(S) using urllib2 and store the HTTP response headers.

# <!-- collapse=True -->
import urllib2
import httplib


c_targets = [t for t in targets['Target'][:20]]
frames = []

# Collect here all URLs failed to connect to
error_urls = []

def send_request(target, data):
    """ Sends a single request to the target """            
    
    # Set own headers
    headers = {'User-Agent' : 'Mozilla 5.10'}

    # Create request
    request = urllib2.Request(target, None, headers)
    
    # Default response
    response = None
        
    try:
        # Send request
        response = urllib2.urlopen(request, timeout=5)
        
        # Add headers
        for h in response.info():
            data.append([target, response.code, h, response.info()[h]])
        
    except urllib2.HTTPError, e:
        print('[ERROR] HTTPError = ' + str(e.code))
        data.append([target, e.code, '', ''])
            
    except urllib2.URLError, e:
        print('[ERROR] URLError = ' + str(e.reason))
        data.append([target, e.reason, '', ''])
            
    except ValueError, e:
        # Most probably the target didn't have any scheme,
        # so remember it and retry later with http:// prepended
        error_urls.append(target)
        print('[ERROR] ValueError = ' + e.message)
            
    except httplib.HTTPException, e:
        print('[ERROR] HTTPException')
            
    except Exception:
        import traceback
        print('[ERROR] Exception: ' + traceback.format_exc())
        
    finally:
        return response


def open_connection(targets):
    """ Iterate through targets and send requests """
    data = []
    for t in targets:
        print("Connecting to\t %s" % t)
        
        response = send_request(t, data)
        
    # Create DF only if data is not empty
    if data:
        df = pd.DataFrame(data, columns=['url', 'response', 'header', 'value'])
        frames.append(df)    
        

# Open connection to targets and collect information
open_connection(c_targets)

# If any URLs failed because they lacked a scheme,
# prepend http:// and run them again
new_targets = ["http://" + u for u in error_urls]
open_connection(new_targets)
Connecting to	 http://www.altpress.org/
Connecting to	 http://www.nzfortress.co.nz
Connecting to	 http://www.evillasforsale.com
Connecting to	 http://www.playingenemy.com/
[ERROR] URLError = timed out
Connecting to	 http://www.richardsonscharts.com
Connecting to	 http://www.xenith.net
[ERROR] Exception: Traceback (most recent call last):
  File "<ipython-input-19-d057092f77b5>", line 26, in send_request
    response = urllib2.urlopen(request, timeout=5)
  File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
    return _opener.open(url, data, timeout)
  File "/usr/lib/python2.7/urllib2.py", line 401, in open
    response = self._open(req, data)
  File "/usr/lib/python2.7/urllib2.py", line 419, in _open
    '_open', req)
  File "/usr/lib/python2.7/urllib2.py", line 379, in _call_chain
    result = func(*args)
  File "/usr/lib/python2.7/urllib2.py", line 1211, in http_open
    return self.do_open(httplib.HTTPConnection, req)
  File "/usr/lib/python2.7/urllib2.py", line 1184, in do_open
    r = h.getresponse(buffering=True)
  File "/usr/lib/python2.7/httplib.py", line 1034, in getresponse
    response.begin()
  File "/usr/lib/python2.7/httplib.py", line 407, in begin
    version, status, reason = self._read_status()
  File "/usr/lib/python2.7/httplib.py", line 365, in _read_status
    line = self.fp.readline()
  File "/usr/lib/python2.7/socket.py", line 447, in readline
    data = self._sock.recv(self._rbufsize)
timeout: timed out

Connecting to	 http://www.tdbrecords.com
Connecting to	 http://www.electrichumanproject.com/
Connecting to	 http://tweekerchick.blogspot.com/
Connecting to	 http://www.besound.com/pushead/home.html
Connecting to	 http://www.porkchopscreenprinting.com/
Connecting to	 http://www.kinseyvisual.com
Connecting to	 http://www.rathergood.com
Connecting to	 http://www.lepoint.fr/
Connecting to	 http://www.revhq.com
Connecting to	 http://www.poprocksandcoke.com
Connecting to	 http://www.samuraiblue.com/
Connecting to	 http://www.openbsd.org/cgi-bin/man.cgi
Connecting to	 http://www.sysblog.com
Connecting to	 http://www.voicesofsafety.com
df_connection = pd.concat(frames)
df_connection.set_index(['url', 'response', 'header'])
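
Since the whole point is to find out which applications run behind those URLs, the Server and X-Powered-By headers are the first ones worth looking at. A small sketch (matching case-insensitively, since the header names may have been stored in lower case):

# Which web server does each target report (if any)?
is_server = df_connection['header'].str.lower() == 'server'
print(df_connection[is_server][['url', 'value']])

# X-Powered-By often hints at the application stack behind the server
is_powered = df_connection['header'].str.lower() == 'x-powered-by'
print(df_connection[is_powered][['url', 'value']])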

Save to Excel

Now feel free to do whatever you want with your DataFrames: export them to CSV, Excel, plain text, etc.

from pandas import ExcelWriter
writer = ExcelWriter('Excel/output.xls')
df_whois.to_excel(writer, "Sheet - WHOIS")
df_dns.to_excel(writer, "Sheet - DNS")
#df_connection.to_excel(writer, "Sheet - Connections")

# Don't forget to flush the workbook to disk
writer.save()

Since I wasn’t able to export df_connection to Excel (Exception: Unexpected data type <class 'socket.timeout'>), I had to export it to CSV instead:

df_connection.to_csv("Excel/connection.csv", sep="\t", header=True)
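
If you really want the connection data in an Excel sheet as well, one possible workaround (a sketch I haven’t verified against the exact pandas version used here) is to coerce every cell to text first, which gets rid of the socket.timeout objects hiding in the response column. The workbook name below is just a placeholder:

# Coerce every cell to text so objects like socket.timeout become plain strings
df_connection_str = df_connection.applymap(lambda v: unicode(v))

# Write to a separate (hypothetical) workbook
conn_writer = ExcelWriter('Excel/connections.xls')
df_connection_str.to_excel(conn_writer, "Sheet - Connections")
conn_writer.save()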