Suppose you get a list of some URLs and you are asked to “investigate” them. The list is full of some random URLs related to your company and nobody knows about. You don’t have a clue who is responsible for them nor which applications (if any) are running behind them. Sounds like a cool task, ugh?

Well in today’s post I’ll show you how I’ve managed it to minimize the process of analyzing each URL manually and saved me a lot of time automatizing things.

Setup environment

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34


%pylab inline
# <!-- collapse=True -->
import binascii
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import datetime as dt
import time
import ipy_table
import dnslib
import pythonwhois
import urlparse
import tldextract
import json
import os
import sys
import urllib2


from yurl import URL
from urlparse import urlparse
from IPython.display import display_pretty, display_html, display_jpeg, display_png, display_json, display_latex, display_svg

# Ipython settings
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 3000)
pd.set_option('display.column_space', 1000)

# Change working directory
os.chdir("/root/work/appsurvey")

Populating the interactive namespace from numpy and matplotlib
height has been deprecated.

First I’ll read the list of targets from some CSV file.

1
2
3
4
5
6
7
8


# Fetch list of random URLs (found using Google)
response = urllib2.urlopen('http://files.ianonavy.com/urls.txt')
targets_row = response.read()

# Create DataFrame
targets = pd.DataFrame([t for t in targets_row.splitlines()], columns=["Target"])
print("First 20 entries in the targets list")
targets[:20]

First 20 entries in the targets list

	Target
0	http://www.altpress.org/
1	http://www.nzfortress.co.nz
2	http://www.evillasforsale.com
3	http://www.playingenemy.com/
4	http://www.richardsonscharts.com
5	http://www.xenith.net
6	http://www.tdbrecords.com
7	http://www.electrichumanproject.com/
8	http://tweekerchick.blogspot.com/
9	http://www.besound.com/pushead/home.html
10	http://www.porkchopscreenprinting.com/
11	http://www.kinseyvisual.com
12	http://www.rathergood.com
13	http://www.lepoint.fr/
14	http://www.revhq.com
15	http://www.poprocksandcoke.com
16	http://www.samuraiblue.com/
17	http://www.openbsd.org/cgi-bin/man.cgi
18	http://www.sysblog.com
19	http://www.voicesofsafety.com

Now I’ll split the URLs in several parts:

schema (HTTP, FTP, SSH etc.)
host
port
path
query (?q=somevalue etc.)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26


# <!-- collapse=True -->
# Join root domain + suffix
extract_root_domain =  lambda x: '.'.join(tldextract.extract(x)[1:3])

target_columns = ['scheme', 'userinfo', 'host', 'port', 'path', 'query', 'fragment', 'decoded']
target_component = [list(URL(t)) for t in targets['Target']]

df_targets = pd.DataFrame(target_component, columns=target_columns)
empty_hosts = df_targets[df_targets['host'] == '']

# Copy path information to host
for index,row in empty_hosts.iterrows():
    df_targets.ix[index:index]['host'] = df_targets.ix[index:index]['path']
    df_targets.ix[index:index]['path'] = ''
    
# Extract root tld
df_targets['root_domain'] = df_targets['host'].apply(extract_root_domain)

# Drop unnecessary columns
df_targets.drop(['query', 'fragment', 'decoded'], axis=1, inplace=True)

# Write df to file (for later use)
df_targets.to_csv("targets_df.csv", sep="\t")

print("First 20 Entries")
df_targets[:20]

First 20 Entries

	scheme	host	path	root_domain
0	http	www.altpress.org	/	altpress.org
1	http	www.nzfortress.co.nz		nzfortress.co.nz
2	http	www.evillasforsale.com		evillasforsale.com
3	http	www.playingenemy.com	/	playingenemy.com
4	http	www.richardsonscharts.com		richardsonscharts.com
5	http	www.xenith.net		xenith.net
6	http	www.tdbrecords.com		tdbrecords.com
7	http	www.electrichumanproject.com	/	electrichumanproject.com
8	http	tweekerchick.blogspot.com	/	tweekerchick.blogspot.com
9	http	www.besound.com	/pushead/home.html	besound.com
10	http	www.porkchopscreenprinting.com	/	porkchopscreenprinting.com
11	http	www.kinseyvisual.com		kinseyvisual.com
12	http	www.rathergood.com		rathergood.com
13	http	www.lepoint.fr	/	lepoint.fr
14	http	www.revhq.com		revhq.com
15	http	www.poprocksandcoke.com		poprocksandcoke.com
16	http	www.samuraiblue.com	/	samuraiblue.com
17	http	www.openbsd.org	/cgi-bin/man.cgi	openbsd.org
18	http	www.sysblog.com		sysblog.com
19	http	www.voicesofsafety.com		voicesofsafety.com

Whois

Now get WHOIS information based on data in df_targets:

1
2
3
4


%%bash
if [ ! -d "WHOIS" ]; then
    mkdir WHOIS
fi

1
2
3


# Get unique values
uniq_roots = df_targets['root_domain'].unique()
uniq_subdomains = df_targets['host'].unique()

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38


# <!-- collapse=True -->

def date_handler(obj):
    return obj.isoformat() if hasattr(obj, 'isoformat') else obj

target_whois = {}

def fetch_whois(domains):
    """ Fetch WHOIS information for specified domains (list) """
    for d in domains:
        print("Get WHOIS for\t %s ..." % d)

        # Check if file already exists
        if os.path.isfile("WHOIS/%s.json" % d):
            print("File exists already. Aborting.")
            continue

        try:
            # Get whois information
            whois_data = pythonwhois.get_whois(d)

            # Convert to JSON$
            json_data = json.dumps(whois_data, default=date_handler)

            # Write contents to file
            with open('WHOIS/%s.json' % d, 'w') as outfile:
              json.dump(json_data, outfile)

            # Sleep for 20s    
            time.sleep(20)
        except:
            print("[ERROR] Couldn't retrieve WHOIS for\t %s" % d)
            
# I'll only fetch the root domains and only the first 20. Feel free to uncomment this
# and adapt it to your needs.
#fetch_whois(uniq_subdomains)
fetch_whois(uniq_roots[:20])
    

Get WHOIS for	 altpress.org ...
Get WHOIS for	 nzfortress.co.nz ...
Get WHOIS for	 evillasforsale.com ...
Get WHOIS for	 playingenemy.com ...
Get WHOIS for	 richardsonscharts.com ...
Get WHOIS for	 xenith.net ...
Get WHOIS for	 tdbrecords.com ...
Get WHOIS for	 electrichumanproject.com ...
Get WHOIS for	 tweekerchick.blogspot.com ...
Get WHOIS for	 besound.com ...
Get WHOIS for	 porkchopscreenprinting.com ...
Get WHOIS for	 kinseyvisual.com ...
Get WHOIS for	 rathergood.com ...
Get WHOIS for	 lepoint.fr ...
Get WHOIS for	 revhq.com ...
Get WHOIS for	 poprocksandcoke.com ...
Get WHOIS for	 samuraiblue.com ...
Get WHOIS for	 openbsd.org ...
Get WHOIS for	 sysblog.com ...
Get WHOIS for	 voicesofsafety.com ...

Get all DNS records

1
2
3
4


%%bash
if [ ! -d "DNS" ]; then
    mkdir DNS
fi

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26


# <!-- collapse=True -->
def fetch_dns(domains):
    """ Fetch all DNS records for specified domains (list) """
    for d in domains:
        print("Dig DNS records for\t %s ..." % d)

        # Check if file already exists
        if os.path.isfile("DNS/%s.txt" % d):
            print("File exists already. Aborting.")
            continue
            
        # Get DNS info
        dig_data = !dig +nocmd $d any +multiline +noall +answer
        dig_output = "\n".join(dig_data)
        
        # Write contents to file
        with open('DNS/%s.txt' % d, 'w') as outfile:
            outfile.write(dig_output)
            outfile.close()
        
        time.sleep(5)
        
# I'll only fetch the root domains and only the first 20. Feel free to uncomment this
# and adapt it to your needs.
#fetch_dns(uniq_subdomains)
fetch_dns(uniq_roots[:20])

Dig DNS records for	 altpress.org ...
Dig DNS records for	 nzfortress.co.nz ...
Dig DNS records for	 evillasforsale.com ...
Dig DNS records for	 playingenemy.com ...
Dig DNS records for	 richardsonscharts.com ...
Dig DNS records for	 xenith.net ...
Dig DNS records for	 tdbrecords.com ...
Dig DNS records for	 electrichumanproject.com ...
Dig DNS records for	 tweekerchick.blogspot.com ...
Dig DNS records for	 besound.com ...
Dig DNS records for	 porkchopscreenprinting.com ...
Dig DNS records for	 kinseyvisual.com ...
Dig DNS records for	 rathergood.com ...
Dig DNS records for	 lepoint.fr ...
Dig DNS records for	 revhq.com ...
Dig DNS records for	 poprocksandcoke.com ...
Dig DNS records for	 samuraiblue.com ...
Dig DNS records for	 openbsd.org ...
Dig DNS records for	 sysblog.com ...
Dig DNS records for	 voicesofsafety.com ...

Read WHOIS information

After collecting the data I’ll try to manipulate data in a pythonic way in order to export it later to some useful format like Excel. I’ll therefor read the collected data from every single file, merge the data and create a DataFrame.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46


# <!-- collapse=True -->
from pprint import pprint

# Global DF frames
frames = []

def read_whois(domains):
    for d in domains:
        print("Reading WHOIS for\t %s" % d)
        
        try:
            with open('WHOIS/%s.json' % d, 'r') as inputfile:
                whois = json.loads(json.load(inputfile))

                # Delete raw record
                whois.pop('raw', None)

                data = []
                
                # Iterate contacts -> tech
                if whois['contacts']['tech']:
                    for i in whois['contacts']['tech']:
                        data.append([d, 'contacts', 'tech', i, whois['contacts']['tech'][i]])

                # Iterate contacts -> admin
                if whois['contacts']['admin']:
                    for i in whois['contacts']['admin']:
                        data.append([d, 'contacts', 'admin', i, whois['contacts']['admin'][i]])

                # Nameservers
                if "nameservers" in whois:
                    for i in whois['nameservers']:
                        data.append([d, 'nameservers', '', '', i])

                # Create DF only if data is not empty
                if data:
                    df = pd.DataFrame(data, columns=['domain', 'element', 'type', 'field', 'value'])
                    frames.append(df)

                # Close file
                inputfile.close()
        except:
            print("[ERROR] Couldn't read WHOIS for\t %s" % d)

#read_whois(uniq_subdomains)
read_whois(uniq_roots[:20])

Reading WHOIS for	 altpress.org
Reading WHOIS for	 nzfortress.co.nz
Reading WHOIS for	 evillasforsale.com
Reading WHOIS for	 playingenemy.com
Reading WHOIS for	 richardsonscharts.com
Reading WHOIS for	 xenith.net
Reading WHOIS for	 tdbrecords.com
Reading WHOIS for	 electrichumanproject.com
Reading WHOIS for	 tweekerchick.blogspot.com
Reading WHOIS for	 besound.com
Reading WHOIS for	 porkchopscreenprinting.com
Reading WHOIS for	 kinseyvisual.com
Reading WHOIS for	 rathergood.com
Reading WHOIS for	 lepoint.fr
Reading WHOIS for	 revhq.com
Reading WHOIS for	 poprocksandcoke.com
Reading WHOIS for	 samuraiblue.com
Reading WHOIS for	 openbsd.org
Reading WHOIS for	 sysblog.com
Reading WHOIS for	 voicesofsafety.com

1

df_whois = pd.concat(frames)

1

df_whois.set_index(['domain', 'element', 'type', 'field'])

				value
domain	element	type	field
altpress.org	contacts	tech	city	Baltimore
			handle	AB10045-GANDI
			name	a.h.s. boy
			country	US
			phone	+1.4102358565
			state	MD
			street	2710 N. Calvert St
			postalcode	21218
			organization	dada typo
			email	29bcde81a3c0e645a9f2a60290ecf2df-1566139@contact.gandi.net
		admin	city	Baltimore
			handle	AB10045-GANDI
			name	a.h.s. boy
			country	US
			phone	+1.4102358565
			state	MD
			street	2710 N. Calvert St
			postalcode	21218
			organization	dada typo
			email	29bcde81a3c0e645a9f2a60290ecf2df-1566139@contact.gandi.net
	nameservers			DNS.NOTHINGNESS.ORG
	nameservers			DNS.DADATYPO.NET
evillasforsale.com	contacts	tech	city	Manchester
			name	Andy Deakin
			country	GB
			phone	+44.1616605550
			state	Greater Manchester
			street	66 Grosvenor St Denton
			postalcode	M34 3GA
			organization	PCmend.net Computer Solutions Limited
			email	domains@pcmend.net
		admin	city	Manchester
			name	Andy Deakin
			country	GB
			phone	+44.1616605550
			state	Greater Manchester
			street	66 Grosvenor St Denton
			postalcode	M34 3GA
			organization	PCmend.net Computer Solutions Limited
			email	domains@pcmend.net
	nameservers			NS1.PCMEND.NET
	nameservers			NS2.PCMEND.NET
playingenemy.com	nameservers			ns04.a2z-server.jp
playingenemy.com	nameservers			dns04.a2z-server.jp
richardsonscharts.com	contacts	tech	city	New Bedford
			fax	+1.5089926604
			name	Garrity, Christopher
			country	US
			phone	+1.8888396604
			state	MA
			street	90 Hatch Street, 1st Floor
			postalcode	02745
			organization	null
			email	cgarrity@maptech.com
		admin	city	New Bedford
			fax	+1.5089926604
			name	Estes, Lee
			country	US
			phone	+1.8888396604
			state	MA
			street	90 Hatch Street, 1st Floor
			postalcode	02745
			organization	null
			email	richcharts@aol.com
	nameservers			NS2.TERENCENET.NET
	nameservers			NS.TERENCENET.NET
xenith.net	contacts	tech	city	PALM SPRINGS
			fax	+1.7603255504
			name	DNS Admin
			country	US
			phone	+1.7603254755
			state	CA
			street	1001 S PALM CANYON DR STE 217
			postalcode	92264-8349
			organization	DNS Admin
			email	dns@ADVANCEDMINDS.COM
		admin	city	San Luis Obispo
			fax	+1.7345724470
			name	Phelan, Kelly
			country	US
			phone	+1.7349456066
			state	CA
			street	777 Mill St Apt 6
			postalcode	93401
			organization	null
			email	centaurus7@AOL.COM
	nameservers			NS2.WEST-DATACENTER.NET
	nameservers			NS1.WEST-DATACENTER.NET
tdbrecords.com	contacts	tech	city	Boston
			name	Jonah Livingston
			country	United States
			phone	6172308529
			state	Massachusetts
			street	902 Huntington ave
			postalcode	02115
			organization	TDB Records
			email	bloodbathrecords@aol.com
		admin	city	Boston
			name	Jonah Livingston
			country	United States
			phone	6172308529
			state	Massachusetts
			street	902 Huntington ave
			postalcode	02115
			organization	TDB Records
			email	bloodbathrecords@aol.com
	nameservers			NS1.DREAMHOST.COM
				NS2.DREAMHOST.COM
				NS3.DREAMHOST.COM
electrichumanproject.com	contacts	tech	city	Tsukuba
			name	840Domains Tsukuba 840Domains
			country	Japan
			phone	+81.5055349763
			state	Ibaraki
			street	Baien 2-1-15\nSupuringutekku Tsukuba bld. 401
			postalcode	305-0045
			organization	Tsukuba
			email	domain_resister@yahoo.co.jp
		admin	city	Tsukuba
			name	840Domains Tsukuba 840Domains
			country	Japan
			phone	+81.5055349763
			state	Ibaraki
			street	Baien 2-1-15\nSupuringutekku Tsukuba bld. 401
			postalcode	305-0045
			organization	Tsukuba
			email	domain_resister@yahoo.co.jp
	nameservers			SNS41.WEBSITEWELCOME.COM
	nameservers			SNS42.WEBSITEWELCOME.COM
besound.com	contacts	tech	city	San Diego
			fax	858-450-0567
			country	United States
			phone	858-458-0490
			state	California
			street	5266 Eastgate Mall
			postalcode	92121
			organization	A+Net
			email	dns@abac.com
		admin	city	LINDENHURST
			fax	999 999 9999
			name	Richard Lopez
			country	United States
			phone	(516) 226-8430
			state	New York
			street	180 34TH ST
			postalcode	11757-3243
			organization	BeSound Multimedia
			email	besound@optonline.net
	nameservers			BDNS.CV.SITEPROTECT.COM
	nameservers			ADNS.CV.SITEPROTECT.COM
porkchopscreenprinting.com	contacts	tech	city	New York
			name	Domain Registrar
			country	US
			phone	+1.9027492701
			state	NY
			street	575 8th Avenue 11th Floor
			postalcode	10018
			organization	Register.Com
		admin	city	Seattle
			name	Damon Baldwin
			country	US
			phone	+1.2067064764
			state	WA
			street	9218 9th ave NW
			postalcode	98117
			organization	Pork Chop Screen Printing
	nameservers			ns1.hosting-advantage.com
	nameservers			ns2.hosting-advantage.com
kinseyvisual.com	contacts	tech	city	Culver City
			fax	+1.8186498230
			name	ADMINISTRATOR, DOMAIN
			country	US
			phone	+1.8775784000
			state	CA
			street	8520 National Blvd. #A
			postalcode	90232
			organization	Media Temple
			email	dnsadmin@MEDIATEMPLE.NET
		admin	city	SAN DIEGO
			fax	+1.6195449594
			name	Kinsey, Dave
			country	US
			phone	+1.6195449595
			state	CA
			street	705 12TH AVE
			postalcode	92101-6507
			organization	BlkMkrt Inc.
			email	dave@BLKMRKT.COM
	nameservers			NS1.MEDIATEMPLE.NET
	nameservers			NS2.MEDIATEMPLE.NET
rathergood.com	contacts	tech	city	London
			fax	+1.9999999999
			name	Veitch, Joel
			country	UK
			phone	+1.08072547734
			state	null
			street	10 Croston Street
			postalcode	null
			organization	null
			email	joel@rathergood.com
		admin	city	London
			fax	+1.9999999999
			name	Veitch, Joel
			country	UK
			phone	+1.08072547734
			state	null
			street	10 Croston Street
			postalcode	null
			organization	null
			email	joel@rathergood.com
	nameservers			NS1.DREAMHOST.COM
				NS3.DREAMHOST.COM
				NS2.DREAMHOST.COM
lepoint.fr	contacts	tech	city	Paris
			handle	GR283-FRNIC
			name	GANDI ROLE
			country	FR
			street	Gandi\n15, place de la Nation
			postalcode	75011
			type	ROLE
			email	noc@gandi.net
			changedate	2006-03-03T00:00:00
		admin	city	Paris
			handle	SDED175-FRNIC
			name	SOCIETE D'EXPLOITATION DE L'HEBDOMADAIRE LE POINT
			country	FR
			phone	+33 1 44 10 10 10
			street	74, avenue du maine
			postalcode	75014
			type	ORGANIZATION
			email	b396c2138803c796a2cc37d347a1797c-857941@contact.gandi.net
			changedate	2013-07-10T00:00:00
	nameservers			b.dns.gandi.net
				a.dns.gandi.net
				c.dns.gandi.net
revhq.com	contacts	tech	city	HUNTINGTON BEACH
			fax	+1.5555555555
			name	JORDAN COOPER
			country	US
			phone	+1.7148427584
			state	CA
			street	P.O. BOX 5232
			postalcode	92615
			organization	REV DISTRIBUTION
			email	JCOOPER@REVHQ.COM
		admin	city	HUNTINGTON BEACH
			fax	+1.5555555555
			name	JORDAN COOPER
			country	US
			phone	+1.7148427584
			state	CA
			street	P.O. BOX 5232
			postalcode	92615
			organization	REV DISTRIBUTION
			email	JCOOPER@REVHQ.COM
	nameservers			NS1.CLOUDNS.NET
				NS2.CLOUDNS.NET
				NS3.CLOUDNS.NET
				NS4.CLOUDNS.NET
poprocksandcoke.com	contacts	tech	city	Ljubljana
			name	Matija Zajec
			country	Slovenia
			phone	+386.30363699
			state	Osrednjeslovenska
			street	Krizevniska ulica 7
			postalcode	1000
			email	kukmak@gmail.com
		admin	city	Ljubljana
			name	Matija Zajec
			country	Slovenia
			phone	+386.30363699
			state	Osrednjeslovenska
			street	Krizevniska ulica 7
			postalcode	1000
			email	kukmak@gmail.com
	nameservers			NS3.WEBDNS.PW
	nameservers			NS4.WEBDNS.PW
samuraiblue.com	contacts	tech	city	Louisville
			fax	+1.5025692774
			name	MaximumASP, LLC
			country	US
			phone	+1.5025692771
			state	KY
			street	540 Baxter Avenue
			postalcode	40204
			organization	MaximumASP, LLC
			email	noc@maximumasp.com
		admin	city	Tampa
			fax	+1.9999999999
			name	Meronek, Rob
			country	US
			phone	+1.838575819
			state	FL
			street	777 North Ashley Drive #1212
			postalcode	33602
			organization	The Boardr
			email	rob@meronek.com
	nameservers			DNS1.MIDPHASE.COM
	nameservers			DNS2.MIDPHASE.COM
openbsd.org	contacts	tech	city	Calgary Alberta
			handle	CR32086106
			name	Theos Software
			country	CA
			phone	+1.40323798
			state	Alberta
			street	812 23rd ave SE
			postalcode	T2G 1N8
			organization	Theos Software
			email	deraadt@theos.com
		admin	city	Calgary
			handle	CR32086107
			name	Theo de Raadt
			country	CA
			phone	+1.4032379834
			state	Alberta
			street	812 23rd Ave SE
			postalcode	T2G1N8
			organization	Theos Software
			email	deraadt@theos.com
	nameservers			NS1.TELSTRA.NET
				NS.SIGMASOFT.COM
				NS1.SUPERBLOCK.NET
				NS2.SUPERBLOCK.NET
				ZEUS.THEOS.COM
				C.NS.BSWS.DE
				A.NS.BSWS.DE
sysblog.com	contacts	tech	city	Waltham
			fax	+1.7818392801
			name	Toll Free: 866-822-9073 Worldwide: 339-222-5132 This Domain For Sale
			country	US
			phone	+1.8668229073
			state	MA
			street	738 Main Street #389
			postalcode	02451
			organization	BuyDomains.com
			email	brokerage@buydomains.com
		admin	city	Waltham
			fax	+1.7818392801
			name	Toll Free: 866-822-9073 Worldwide: 339-222-5132 This Domain For Sale
			country	US
			phone	+1.8668229073
			state	MA
			street	738 Main Street #389
			postalcode	02451
			organization	BuyDomains.com
			email	brokerage@buydomains.com
	nameservers			NS.BUYDOMAINS.COM
	nameservers			THIS-DOMAIN-FOR-SALE.COM
voicesofsafety.com	contacts	tech	city	Burlington
			fax	+1.782722915
			name	BizLand.com, Inc.
			country	US
			phone	+1.782725585
			state	MA
			street	121 Middlesex Turnpike
			postalcode	01803
			organization	BizLand.com, Inc.
			email	DomReg@BIZLAND-INC.COM
		admin	city	NORTHE COLDWELL
			fax	+1.9732280276
			name	VOICESOFSAFTY INT'L
			country	US
			phone	+1.9732282258
			state	NJ
			street	264 park ave
			postalcode	07006
			organization	VOICESOFSAFTY INT'L
			email	webmaster@voicesofsafety.com
	nameservers			CLICKME2.CLICK2SITE.COM
	nameservers			CLICKME.CLICK2SITE.COM

Read DNS information

Do the same with the DNS files…

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39


# <!-- collapse=True -->
from pprint import pprint
import re
import traceback

# Global DF frames
frames = []

def read_dns(domains):
    for d in domains:
        print("Reading WHOIS for\t %s" % d)
        data = []
        try:
            with open('DNS/%s.txt' % d, 'r') as inputfile:
                dns = inputfile.read()
                
                for l in dns.splitlines():
                    records = l.split()
                    
                    # Check only for NS, MX, A, CNAME, TXT
                    a = re.compile("^(NS|MX|A|CNAME|TXT)$")
                    if len(records) >= 4:
                        if a.match(records[3]):
                            data.append([d, records[3], records[4]])
                
                # Create DF only if data is not empty
                if data:
                    df = pd.DataFrame(data, columns=['domain', 'dns_record', 'value'])
                    frames.append(df)      
                    
                # Close file
                inputfile.close()
                
        except Exception, err:
            print("[ERROR] Couldn't read WHOIS for\t %s" % d)
            traceback.print_exc()

#read_dns(uniq_subdomains)            
read_dns(uniq_roots[:20])

Reading WHOIS for	 altpress.org
Reading WHOIS for	 nzfortress.co.nz
Reading WHOIS for	 evillasforsale.com
Reading WHOIS for	 playingenemy.com
Reading WHOIS for	 richardsonscharts.com
Reading WHOIS for	 xenith.net
Reading WHOIS for	 tdbrecords.com
Reading WHOIS for	 electrichumanproject.com
Reading WHOIS for	 tweekerchick.blogspot.com
Reading WHOIS for	 besound.com
Reading WHOIS for	 porkchopscreenprinting.com
Reading WHOIS for	 kinseyvisual.com
Reading WHOIS for	 rathergood.com
Reading WHOIS for	 lepoint.fr
Reading WHOIS for	 revhq.com
Reading WHOIS for	 poprocksandcoke.com
Reading WHOIS for	 samuraiblue.com
Reading WHOIS for	 openbsd.org
Reading WHOIS for	 sysblog.com
Reading WHOIS for	 voicesofsafety.com

1

df_dns = pd.concat(frames)

1

df_dns.set_index(['domain', 'dns_record'])

		value
domain	dns_record
altpress.org	NS	dns.dadatypo.net.
altpress.org	NS	dns.nothingness.org.
nzfortress.co.nz	NS	ns-1637.awsdns-12.co.uk.
	NS	ns-913.awsdns-50.net.
	NS	ns-203.awsdns-25.com.
	NS	ns-1284.awsdns-32.org.
evillasforsale.com	NS	ns2.pcmend.net.
evillasforsale.com	NS	ns1.pcmend.net.
playingenemy.com	NS	dns04.a2z-server.jp.
playingenemy.com	NS	ns04.a2z-server.jp.
richardsonscharts.com	NS	ns2.interbasix.net.
	A	207.97.239.35
	MX	10
	TXT	"v=spf1
	NS	ns.interbasix.net.
	MX	30
	MX	40
	MX	20
xenith.net	NS	ns1.west-datacenter.net.
	NS	ns2.west-datacenter.net.
	A	206.130.121.98
	MX	10
tdbrecords.com	NS	ns2.dreamhost.com.
	NS	ns1.dreamhost.com.
	MX	0
	NS	ns3.dreamhost.com.
	MX	0
	A	75.119.220.89
electrichumanproject.com	NS	sns41.websitewelcome.com.
	NS	sns42.websitewelcome.com.
	A	67.18.68.14
tweekerchick.blogspot.com	CNAME	blogspot.l.googleusercontent.com.
	A	173.194.44.10
	A	173.194.44.12
	A	173.194.44.11
besound.com	NS	bdns.cv.siteprotect.com.
besound.com	NS	adns.cv.siteprotect.com.
porkchopscreenprinting.com	NS	ns1.hosting-advantage.com.
	NS	ns2.hosting-advantage.com.
	A	64.92.121.42
	MX	5
kinseyvisual.com	A	205.186.183.161
	NS	ns1.mediatemple.net.
	MX	10
	NS	ns2.mediatemple.net.
rathergood.com	MX	0
	NS	ns2.dreamhost.com.
	NS	ns1.dreamhost.com.
	MX	0
	NS	ns3.dreamhost.com.
	A	64.90.57.150
lepoint.fr	NS	c.dns.gandi.net.
	NS	b.dns.gandi.net.
	NS	a.dns.gandi.net.
revhq.com	NS	ns1.cloudns.net.
	NS	ns4.cloudns.net.
	NS	ns3.cloudns.net.
	NS	ns2.cloudns.net.
poprocksandcoke.com	A	184.164.147.132
	MX	0
	NS	ns3.webdns.pw.
	NS	ns4.webdns.pw.
samuraiblue.com	NS	dns1.anhosting.com.
	NS	dns2.anhosting.com.
	MX	0
	TXT	"v=spf1
	A	174.127.110.249
openbsd.org	NS	c.ns.bsws.de.
	NS	ns2.superblock.net.
	A	129.128.5.194
	NS	a.ns.bsws.de.
	NS	ns1.superblock.net.
	NS	ns.sigmasoft.com.
	NS	ns1.telstra.net.
	NS	zeus.theos.com.
	MX	10
	MX	6
sysblog.com	MX	0
	A	66.151.181.49
	TXT	"v=spf1
voicesofsafety.com	NS	clickme.click2site.com.
voicesofsafety.com	NS	clickme2.click2site.com.

Connect to targets

For every single target I’ll connect to it per HTTP(s) using urllib2 and store the HTTP headers.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79


# <!-- collapse=True -->
import urllib2
import httplib


c_targets = [t for t in targets['Target'][:20]]
frames = []

# Collect here all URLs failed to connect to
error_urls = []

def send_request(target, data):
    """ Sends a single request to the target """            
    
    # Set own headers
    headers = {'User-Agent' : 'Mozilla 5.10'}

    # Create request
    request = urllib2.Request(target, None, headers)
    
    # Default response
    response = None
        
    try:
        # Send request
        response = urllib2.urlopen(request, timeout=5)
        
        # Add headers
        for h in response.info():
            data.append([target, response.code, h, response.info()[h]])
        
    except urllib2.HTTPError, e:
        print('[ERROR] HTTPError = ' + str(e.code))
        data.append([target, e.code, '', ''])
            
    except urllib2.URLError, e:
        print('[ERROR] URLError = ' + str(e.reason))
        data.append([target, e.reason, '', ''])
            
    except ValueError, e:
        # Most probably the target didn't have any schema
        # So send the request again with HTTP
        error_urls.append(target)
        print('[ERROR] ValueError = ' + e.message)
            
    except httplib.HTTPException, e:
        print('[ERROR] HTTPException')
            
    except Exception:
        import traceback
        print('[ERROR] Exception: ' + traceback.format_exc())
        
    finally:
        return response
        

    
    
def open_connection(targets):
    """ Iterate through targets and send requests """
    data = []
    for t in targets:
        print("Connecting to\t %s" % t)
        
        response = send_request(t, data)
        
    # Create DF only if data is not empty
    if data:
        df = pd.DataFrame(data, columns=['url', 'response', 'header', 'value'])
        frames.append(df)    
        

# Open connection to targets and collect information
open_connection(c_targets)

# If there are any urls not having been tested, then
# prepend http:// to <target> and run again
new_targets =  ["http://"+u for u in error_urls]
open_connection(new_targets)

Connecting to	 http://www.altpress.org/
Connecting to	 http://www.nzfortress.co.nz
Connecting to	 http://www.evillasforsale.com
Connecting to	 http://www.playingenemy.com/
[ERROR] URLError = timed out
Connecting to	 http://www.richardsonscharts.com
Connecting to	 http://www.xenith.net
[ERROR] Exception: Traceback (most recent call last):
  File "<ipython-input-19-d057092f77b5>", line 26, in send_request
    response = urllib2.urlopen(request, timeout=5)
  File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
    return _opener.open(url, data, timeout)
  File "/usr/lib/python2.7/urllib2.py", line 401, in open
    response = self._open(req, data)
  File "/usr/lib/python2.7/urllib2.py", line 419, in _open
    '_open', req)
  File "/usr/lib/python2.7/urllib2.py", line 379, in _call_chain
    result = func(*args)
  File "/usr/lib/python2.7/urllib2.py", line 1211, in http_open
    return self.do_open(httplib.HTTPConnection, req)
  File "/usr/lib/python2.7/urllib2.py", line 1184, in do_open
    r = h.getresponse(buffering=True)
  File "/usr/lib/python2.7/httplib.py", line 1034, in getresponse
    response.begin()
  File "/usr/lib/python2.7/httplib.py", line 407, in begin
    version, status, reason = self._read_status()
  File "/usr/lib/python2.7/httplib.py", line 365, in _read_status
    line = self.fp.readline()
  File "/usr/lib/python2.7/socket.py", line 447, in readline
    data = self._sock.recv(self._rbufsize)
timeout: timed out

Connecting to	 http://www.tdbrecords.com
Connecting to	 http://www.electrichumanproject.com/
Connecting to	 http://tweekerchick.blogspot.com/
Connecting to	 http://www.besound.com/pushead/home.html
Connecting to	 http://www.porkchopscreenprinting.com/
Connecting to	 http://www.kinseyvisual.com
Connecting to	 http://www.rathergood.com
Connecting to	 http://www.lepoint.fr/
Connecting to	 http://www.revhq.com
Connecting to	 http://www.poprocksandcoke.com
Connecting to	 http://www.samuraiblue.com/
Connecting to	 http://www.openbsd.org/cgi-bin/man.cgi
Connecting to	 http://www.sysblog.com
Connecting to	 http://www.voicesofsafety.com

1

df_connection = pd.concat(frames)

1

df_connection.set_index(['url', 'response', 'header'])

			value
url	response	header
http://www.altpress.org/	200	content-length	24576
		x-powered-by	PHP/5.2.4-2ubuntu5.27
		set-cookie	PHPSESSID=1498f60d82d31ec081debde379e605eb; path=/
		expires	Thu, 19 Nov 1981 08:52:00 GMT
		vary	Accept-Encoding
		server	Apache/2.2.8 (Ubuntu) PHP/5.2.4-2ubuntu5.27 with Suhosin-Patch mod_ssl/2.2.8 OpenSSL/0.9.8g
		last-modified	Wed, 06 Aug 2014 11:42:08 GMT
		connection	close
		etag	"8ea9fc88e045b56cd96e6fc8b487cbd9"
		pragma	no-cache
		cache-control	public,must-revalidate
		date	Wed, 06 Aug 2014 11:44:55 GMT
		content-type	text/html; charset=utf-8
http://www.nzfortress.co.nz	200	x-powered-by	PHP/5.3.10-1ubuntu3.6
		transfer-encoding	chunked
		set-cookie	bblastvisit=1407325495; expires=Thu, 06-Aug-2015 11:44:55 GMT; path=/, bblastactivity=0; expires...
		vary	Accept-Encoding,User-Agent
		server	Apache/2.2.22 (Ubuntu)
		connection	close
		x-ua-compatible	IE=7
		pragma	private
		cache-control	private
		date	Wed, 06 Aug 2014 11:44:55 GMT
		content-type	text/html; charset=ISO-8859-1
http://www.evillasforsale.com	200	content-length	14610
		accept-ranges	bytes
		vary	Accept-Encoding,User-Agent
		server	Apache/2
		last-modified	Thu, 21 Jan 2010 13:33:43 GMT
		connection	close
		etag	"2040cf7-3912-47dacc06c1bc0"
		date	Wed, 06 Aug 2014 11:46:01 GMT
		content-type	text/html
http://www.playingenemy.com/	timed out
http://www.richardsonscharts.com	200	x-powered-by	PleskLin
		transfer-encoding	chunked
		set-cookie	PHPSESSID=8cg77frbg8biv0ru8m7udb6877; path=/
		expires	Thu, 19 Nov 1981 08:52:00 GMT
		server	Apache
		connection	close
		pragma	no-cache
		cache-control	no-store, no-cache, must-revalidate, post-check=0, pre-check=0
		date	Wed, 06 Aug 2014 11:45:00 GMT
		content-type	text/html
http://www.tdbrecords.com	200	content-length	2600
		accept-ranges	bytes
		vary	Accept-Encoding
		server	Apache
		last-modified	Mon, 03 Oct 2011 00:02:54 GMT
		connection	close
		etag	"a28-4ae59b253c780"
		date	Wed, 06 Aug 2014 11:46:45 GMT
		content-type	text/html
http://www.electrichumanproject.com/	200	content-length	14683
		accept-ranges	bytes
		vary	Accept-Encoding
		server	Apache
		last-modified	Tue, 05 Aug 2014 18:19:00 GMT
		connection	close
		date	Wed, 06 Aug 2014 11:45:06 GMT
		content-type	text/html
http://tweekerchick.blogspot.com/	200	alternate-protocol	80:quic
		x-xss-protection	1; mode=block
		x-content-type-options	nosniff
		expires	Wed, 06 Aug 2014 11:45:06 GMT
		server	GSE
		last-modified	Wed, 06 Aug 2014 05:34:08 GMT
		connection	close
		etag	"d6b75768-8b38-4991-b414-a06cc4608563"
		cache-control	private, max-age=0
		date	Wed, 06 Aug 2014 11:45:06 GMT
		content-type	text/html; charset=UTF-8
http://www.besound.com/pushead/home.html	200	content-length	3870
		accept-ranges	bytes
		server	Apache
		last-modified	Fri, 09 Jun 2006 04:34:30 GMT
		connection	close
		etag	"f1e-415c31dd2c180"
		date	Wed, 06 Aug 2014 11:45:07 GMT
		content-type	text/html
http://www.porkchopscreenprinting.com/	200	content-length	11811
		set-cookie	HttpOnly;Secure
		accept-ranges	bytes
		expires	Wed, 06 Aug 2014 11:45:27 GMT
		server	Apache
		last-modified	Tue, 28 Aug 2012 17:44:17 GMT
		connection	close
		etag	"b893e5-2e23-503d0371"
		cache-control	max-age=20
		date	Wed, 06 Aug 2014 11:45:07 GMT
		content-type	text/html
http://www.kinseyvisual.com	200	x-powered-by	PHP/5.3.27
		transfer-encoding	chunked
		set-cookie	PHPSESSID=b5f9f0af80bf4e08f41eeb02be6e6ad1; path=/
		expires	Thu, 19 Nov 1981 08:52:00 GMT
		vary	User-Agent,Accept-Encoding
		server	Apache/2.2.22
		connection	close
		pragma	no-cache
		cache-control	no-store, no-cache, must-revalidate, post-check=0, pre-check=0
		date	Wed, 06 Aug 2014 11:45:08 GMT
		content-type	text/html
http://www.rathergood.com	200	transfer-encoding	chunked
		set-cookie	c6ef959f4780c6a62e86c7a2d2e5ccea=4ilfnp83k67evmmn281i9qcnu3; path=/
		vary	Accept-Encoding
		server	Apache
		connection	close
		pragma	no-cache
		cache-control	no-cache, max-age=0, no-cache
		date	Wed, 06 Aug 2014 11:45:08 GMT
		p3p	CP="NOI ADM DEV PSAi COM NAV OUR OTRo STP IND DEM"
		content-type	text/html; charset=utf-8
		x-mod-pagespeed	1.6.29.7-3566
http://www.lepoint.fr/	200	x-xss-protection	1; mode=block
		x-content-type-options	nosniff
		x-powered-by	PHP/5.5.9
		transfer-encoding	chunked
		vary	User-Agent,Accept-Encoding
		server	Apache/2.2.25 (Unix) PHP/5.5.9
		connection	close
		date	Wed, 06 Aug 2014 11:45:09 GMT
		x-frame-options	SAMEORIGIN
		content-type	text/html
http://www.revhq.com	200	x-powered-by	Atari TT posix / Python / php 5.3x
		transfer-encoding	chunked
		set-cookie	PHPSESSID=e1jmcg9c2pgbi9rhgcdkhq5ge4; path=/
		expires	Thu, 19 Nov 1981 08:52:00 GMT
		vary	Accept-Encoding
		server	Apache/2.2.22
		connection	close
		pragma	no-cache
		cache-control	no-store, no-cache, must-revalidate, post-check=0, pre-check=0
		date	Wed, 06 Aug 2014 11:45:19 GMT
		content-type	text/html
http://www.poprocksandcoke.com	200	x-powered-by	PHP/5.3.24
		transfer-encoding	chunked
		server	Apache
		connection	close
		date	Wed, 06 Aug 2014 11:45:10 GMT
		content-type	text/html; charset=UTF-8
		x-pingback	http://www.poprocksandcoke.com/xmlrpc.php
http://www.samuraiblue.com/	200	content-length	54005
		x-powered-by	PHP/5.4.31
		server	Apache
		connection	close
		date	Wed, 06 Aug 2014 11:45:12 GMT
		content-type	text/html; charset=UTF-8
		x-pingback	http://samuraiblue.com/xmlrpc.php
http://www.openbsd.org/cgi-bin/man.cgi	200	transfer-encoding	chunked
		server	Apache
		connection	close
		pragma	no-cache
		cache-control	no-cache
		date	Wed, 06 Aug 2014 11:45:13 GMT
		content-type	text/html; charset=utf-8
http://www.sysblog.com	200	content-length	48663
		x-varnish	718735313 718731229
		x-cache	HIT
		x-powered-by	PHP/5.3.16
		set-cookie	PHPSESSID=5vk936712pnke6t5ki26n9frf4; path=/
		accept-ranges	bytes
		expires	Thu, 19 Nov 1981 08:52:00 GMT
		server	Apache
		connection	close
		via	1.1 varnish
		pragma	no-cache
		cache-control	no-store, no-cache, must-revalidate, post-check=0, pre-check=0
		date	Wed, 06 Aug 2014 11:45:14 GMT
		content-type	text/html; charset=UTF-8
		age	40
http://www.voicesofsafety.com	200	content-length	20854
		accept-ranges	bytes, bytes
		server	Apache/2
		connection	close
		date	Wed, 06 Aug 2014 11:45:15 GMT
		content-type	text/html
		age	0

Save to Excel

Now feel free to do whatever you want with your DataFrames: Export them to CSV, EXCEL, TXT etc.

1
2
3
4
5


from pandas import ExcelWriter
writer = ExcelWriter('Excel/output.xls')
df_whois.to_excel(writer, "Sheet - WHOIS")
df_dns.to_excel(writer, "Sheet - DNS")
#df_connection.to_excel(writer, "Sheet - Connections")

Since I wasn’t able to export the df_connection to Excel (Exception: Unexpected data type <class 'socket.timeout'>) I had to export it to CSV:

1

df_connection.to_csv("Excel/connection.csv", sep="\t", header=True)