Suppose you get a list of URLs and are asked to "investigate" them. The list is full of random URLs related to your company, and nobody seems to know anything about them. You don't have a clue who is responsible for them, nor which applications (if any) are running behind them. Sounds like a cool task, huh?
Well, in today's post I'll show you how I managed to cut down the manual analysis of each URL and saved myself a lot of time by automating things.
Set up the environment¶
%pylab inline
import binascii
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import datetime as dt
import time
import ipy_table
import dnslib
import pythonwhois
import tldextract
import json
import os
import sys
import urllib2
from yurl import URL
from urlparse import urlparse
from IPython.display import display_pretty, display_html, display_jpeg, display_png, display_json, display_latex, display_svg
# IPython settings
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 3000)
pd.set_option('display.column_space', 1000)
# Change working directory
os.chdir("/root/work/appsurvey")
Populating the interactive namespace from numpy and matplotlib
height has been deprecated.
First I'll read the list of targets from a plain-text file of URLs (one per line).
# Fetch list of random URLs (found using Google)
response = urllib2.urlopen('http://files.ianonavy.com/urls.txt')
targets_raw = response.read()
# Create DataFrame
targets = pd.DataFrame(targets_raw.splitlines(), columns=["Target"])
print("First 20 entries in the targets list")
targets[:20]
First 20 entries in the targets list
 | Target |
---|---|
0 | http://www.altpress.org/ |
1 | http://www.nzfortress.co.nz |
2 | http://www.evillasforsale.com |
3 | http://www.playingenemy.com/ |
4 | http://www.richardsonscharts.com |
5 | http://www.xenith.net |
6 | http://www.tdbrecords.com |
7 | http://www.electrichumanproject.com/ |
8 | http://tweekerchick.blogspot.com/ |
9 | http://www.besound.com/pushead/home.html |
10 | http://www.porkchopscreenprinting.com/ |
11 | http://www.kinseyvisual.com |
12 | http://www.rathergood.com |
13 | http://www.lepoint.fr/ |
14 | http://www.revhq.com |
15 | http://www.poprocksandcoke.com |
16 | http://www.samuraiblue.com/ |
17 | http://www.openbsd.org/cgi-bin/man.cgi |
18 | http://www.sysblog.com |
19 | http://www.voicesofsafety.com |
Now I'll split the URLs into several parts:
- scheme (http, ftp, ssh etc.)
- host
- port
- path
- query (?q=somevalue etc.)
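To make this concrete, here's a quick sanity check on a single URL using the standard library's urlparse plus tldextract (the bulk processing below uses yurl instead, but the idea is the same):
# Quick sanity check on one URL (stdlib urlparse + tldextract)
from urlparse import urlparse
import tldextract
u = urlparse('http://www.besound.com/pushead/home.html')
print(u.scheme, u.netloc, u.path)  # ('http', 'www.besound.com', '/pushead/home.html')
ext = tldextract.extract('www.besound.com')
print('.'.join(ext[1:3]))  # besound.com (root domain + public suffix)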
# Join root domain + suffix
extract_root_domain = lambda x: '.'.join(tldextract.extract(x)[1:3])
target_columns = ['scheme', 'userinfo', 'host', 'port', 'path', 'query', 'fragment', 'decoded']
target_component = [list(URL(t)) for t in targets['Target']]
df_targets = pd.DataFrame(target_component, columns=target_columns)
# Some URLs lack a scheme, so yurl parses the whole string into 'path';
# move it over to 'host' in those cases
empty_hosts = df_targets[df_targets['host'] == '']
for index, row in empty_hosts.iterrows():
    df_targets.ix[index, 'host'] = df_targets.ix[index, 'path']
    df_targets.ix[index, 'path'] = ''
# Extract the root domain (registered domain + public suffix)
df_targets['root_domain'] = df_targets['host'].apply(extract_root_domain)
# Drop unnecessary columns
df_targets.drop(['query', 'fragment', 'decoded'], axis=1, inplace=True)
# Write df to file (for later use)
df_targets.to_csv("targets_df.csv", sep="\t")
print("First 20 Entries")
df_targets[:20]
First 20 Entries
 | scheme | userinfo | host | port | path | root_domain |
---|---|---|---|---|---|---|
0 | http | | www.altpress.org | | / | altpress.org |
1 | http | | www.nzfortress.co.nz | | | nzfortress.co.nz |
2 | http | | www.evillasforsale.com | | | evillasforsale.com |
3 | http | | www.playingenemy.com | | / | playingenemy.com |
4 | http | | www.richardsonscharts.com | | | richardsonscharts.com |
5 | http | | www.xenith.net | | | xenith.net |
6 | http | | www.tdbrecords.com | | | tdbrecords.com |
7 | http | | www.electrichumanproject.com | | / | electrichumanproject.com |
8 | http | | tweekerchick.blogspot.com | | / | tweekerchick.blogspot.com |
9 | http | | www.besound.com | | /pushead/home.html | besound.com |
10 | http | | www.porkchopscreenprinting.com | | / | porkchopscreenprinting.com |
11 | http | | www.kinseyvisual.com | | | kinseyvisual.com |
12 | http | | www.rathergood.com | | | rathergood.com |
13 | http | | www.lepoint.fr | | / | lepoint.fr |
14 | http | | www.revhq.com | | | revhq.com |
15 | http | | www.poprocksandcoke.com | | | poprocksandcoke.com |
16 | http | | www.samuraiblue.com | | / | samuraiblue.com |
17 | http | | www.openbsd.org | | /cgi-bin/man.cgi | openbsd.org |
18 | http | | www.sysblog.com | | | sysblog.com |
19 | http | | www.voicesofsafety.com | | | voicesofsafety.com |
WHOIS¶
Now get WHOIS information based on the data in df_targets:
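For a feel of what comes back, a single lookup might look like this (the exact keys depend on the registrar, and it needs network access):
# Single WHOIS lookup with pythonwhois
import pythonwhois
w = pythonwhois.get_whois('openbsd.org')
print(sorted(w.keys()))  # typically includes 'contacts', 'nameservers', 'raw', ...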
%%bash
if [ ! -d "WHOIS" ]; then
mkdir WHOIS
fi
# Get unique values
uniq_roots = df_targets['root_domain'].unique()
uniq_subdomains = df_targets['host'].unique()
def date_handler(obj):
return obj.isoformat() if hasattr(obj, 'isoformat') else obj
target_whois = {}
def fetch_whois(domains):
""" Fetch WHOIS information for specified domains (list) """
for d in domains:
print("Get WHOIS for\t %s ..." % d)
        # Skip domains that are already cached on disk
        if os.path.isfile("WHOIS/%s.json" % d):
            print("File exists already. Skipping.")
            continue
        try:
            # Get WHOIS information
            whois_data = pythonwhois.get_whois(d)
            # Serialize to a JSON string (dates handled by date_handler)
            json_data = json.dumps(whois_data, default=date_handler)
            # Write contents to file (json.dump on a string double-encodes;
            # read_whois below undoes this with json.loads(json.load(...)))
            with open('WHOIS/%s.json' % d, 'w') as outfile:
                json.dump(json_data, outfile)
            # Sleep for 20s to avoid hammering the WHOIS servers
            time.sleep(20)
        except Exception:
            print("[ERROR] Couldn't retrieve WHOIS for\t %s" % d)
# I'll only fetch the root domains and only the first 20. Feel free to uncomment this
# and adapt it to your needs.
#fetch_whois(uniq_subdomains)
fetch_whois(uniq_roots[:20])
Get WHOIS for altpress.org ...
Get WHOIS for nzfortress.co.nz ...
Get WHOIS for evillasforsale.com ...
Get WHOIS for playingenemy.com ...
Get WHOIS for richardsonscharts.com ...
Get WHOIS for xenith.net ...
Get WHOIS for tdbrecords.com ...
Get WHOIS for electrichumanproject.com ...
Get WHOIS for tweekerchick.blogspot.com ...
Get WHOIS for besound.com ...
Get WHOIS for porkchopscreenprinting.com ...
Get WHOIS for kinseyvisual.com ...
Get WHOIS for rathergood.com ...
Get WHOIS for lepoint.fr ...
Get WHOIS for revhq.com ...
Get WHOIS for poprocksandcoke.com ...
Get WHOIS for samuraiblue.com ...
Get WHOIS for openbsd.org ...
Get WHOIS for sysblog.com ...
Get WHOIS for voicesofsafety.com ...
Get all DNS records¶
%%bash
if [ ! -d "DNS" ]; then
mkdir DNS
fi
def fetch_dns(domains):
""" Fetch all DNS records for specified domains (list) """
for d in domains:
print("Dig DNS records for\t %s ..." % d)
        # Skip domains that are already cached on disk
        if os.path.isfile("DNS/%s.txt" % d):
            print("File exists already. Skipping.")
            continue
        # Get DNS info (IPython shell integration)
        dig_data = !dig +nocmd $d any +multiline +noall +answer
        dig_output = "\n".join(dig_data)
        # Write contents to file
        with open('DNS/%s.txt' % d, 'w') as outfile:
            outfile.write(dig_output)
        # Sleep for 5s between queries
        time.sleep(5)
# I'll only fetch the root domains and only the first 20. Feel free to uncomment this
# and adapt it to your needs.
#fetch_dns(uniq_subdomains)
fetch_dns(uniq_roots[:20])
Dig DNS records for altpress.org ...
Dig DNS records for nzfortress.co.nz ...
Dig DNS records for evillasforsale.com ...
Dig DNS records for playingenemy.com ...
Dig DNS records for richardsonscharts.com ...
Dig DNS records for xenith.net ...
Dig DNS records for tdbrecords.com ...
Dig DNS records for electrichumanproject.com ...
Dig DNS records for tweekerchick.blogspot.com ...
Dig DNS records for besound.com ...
Dig DNS records for porkchopscreenprinting.com ...
Dig DNS records for kinseyvisual.com ...
Dig DNS records for rathergood.com ...
Dig DNS records for lepoint.fr ...
Dig DNS records for revhq.com ...
Dig DNS records for poprocksandcoke.com ...
Dig DNS records for samuraiblue.com ...
Dig DNS records for openbsd.org ...
Dig DNS records for sysblog.com ...
Dig DNS records for voicesofsafety.com ...
Read WHOIS information¶
After collecting the data I'll manipulate it in a pythonic way in order to export it later to some useful format like Excel. I'll therefore read the collected data from every single file, merge it and create a DataFrame.
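For orientation, one cached record looks roughly like the sketch below (values taken from the altpress.org output further down; any of these keys may be missing for a given domain):
# Rough shape of one cached WHOIS record after dropping 'raw'
whois_example = {
    'contacts': {
        'tech':  {'name': 'a.h.s. boy', 'city': 'Baltimore', 'country': 'US'},
        'admin': {'name': 'a.h.s. boy', 'city': 'Baltimore', 'country': 'US'},
    },
    'nameservers': ['DNS.NOTHINGNESS.ORG', 'DNS.DADATYPO.NET'],
}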
from pprint import pprint
# Global DF frames
frames = []
def read_whois(domains):
for d in domains:
print("Reading WHOIS for\t %s" % d)
        try:
            with open('WHOIS/%s.json' % d, 'r') as inputfile:
                # Undo the double JSON encoding from fetch_whois
                whois = json.loads(json.load(inputfile))
# Delete raw record
whois.pop('raw', None)
data = []
# Iterate contacts -> tech
if whois['contacts']['tech']:
for i in whois['contacts']['tech']:
data.append([d, 'contacts', 'tech', i, whois['contacts']['tech'][i]])
# Iterate contacts -> admin
if whois['contacts']['admin']:
for i in whois['contacts']['admin']:
data.append([d, 'contacts', 'admin', i, whois['contacts']['admin'][i]])
# Nameservers
if "nameservers" in whois:
for i in whois['nameservers']:
data.append([d, 'nameservers', '', '', i])
# Create DF only if data is not empty
if data:
df = pd.DataFrame(data, columns=['domain', 'element', 'type', 'field', 'value'])
frames.append(df)
        except Exception:
            print("[ERROR] Couldn't read WHOIS for\t %s" % d)
#read_whois(uniq_subdomains)
read_whois(uniq_roots[:20])
Reading WHOIS for altpress.org
Reading WHOIS for nzfortress.co.nz
Reading WHOIS for evillasforsale.com
Reading WHOIS for playingenemy.com
Reading WHOIS for richardsonscharts.com
Reading WHOIS for xenith.net
Reading WHOIS for tdbrecords.com
Reading WHOIS for electrichumanproject.com
Reading WHOIS for tweekerchick.blogspot.com
Reading WHOIS for besound.com
Reading WHOIS for porkchopscreenprinting.com
Reading WHOIS for kinseyvisual.com
Reading WHOIS for rathergood.com
Reading WHOIS for lepoint.fr
Reading WHOIS for revhq.com
Reading WHOIS for poprocksandcoke.com
Reading WHOIS for samuraiblue.com
Reading WHOIS for openbsd.org
Reading WHOIS for sysblog.com
Reading WHOIS for voicesofsafety.com
df_whois = pd.concat(frames)
df_whois.set_index(['domain', 'element', 'type', 'field'])
domain | element | type | field | value
---|---|---|---|---|
altpress.org | contacts | tech | city | Baltimore |
handle | AB10045-GANDI | |||
name | a.h.s. boy | |||
country | US | |||
phone | +1.4102358565 | |||
state | MD | |||
street | 2710 N. Calvert St | |||
postalcode | 21218 | |||
organization | dada typo | |||
[email protected] | ||||
admin | city | Baltimore | ||
handle | AB10045-GANDI | |||
name | a.h.s. boy | |||
country | US | |||
phone | +1.4102358565 | |||
state | MD | |||
street | 2710 N. Calvert St | |||
postalcode | 21218 | |||
organization | dada typo | |||
[email protected] | ||||
nameservers | DNS.NOTHINGNESS.ORG | |||
DNS.DADATYPO.NET | ||||
evillasforsale.com | contacts | tech | city | Manchester |
name | Andy Deakin | |||
country | GB | |||
phone | +44.1616605550 | |||
state | Greater Manchester | |||
street | 66 Grosvenor St Denton | |||
postalcode | M34 3GA | |||
organization | PCmend.net Computer Solutions Limited | |||
[email protected] | ||||
admin | city | Manchester | ||
name | Andy Deakin | |||
country | GB | |||
phone | +44.1616605550 | |||
state | Greater Manchester | |||
street | 66 Grosvenor St Denton | |||
postalcode | M34 3GA | |||
organization | PCmend.net Computer Solutions Limited | |||
[email protected] | ||||
nameservers | NS1.PCMEND.NET | |||
NS2.PCMEND.NET | ||||
playingenemy.com | nameservers | ns04.a2z-server.jp | ||
dns04.a2z-server.jp | ||||
richardsonscharts.com | contacts | tech | city | New Bedford |
fax | +1.5089926604 | |||
name | Garrity, Christopher | |||
country | US | |||
phone | +1.8888396604 | |||
state | MA | |||
street | 90 Hatch Street, 1st Floor | |||
postalcode | 02745 | |||
organization | null | |||
[email protected] | ||||
admin | city | New Bedford | ||
fax | +1.5089926604 | |||
name | Estes, Lee | |||
country | US | |||
phone | +1.8888396604 | |||
state | MA | |||
street | 90 Hatch Street, 1st Floor | |||
postalcode | 02745 | |||
organization | null | |||
[email protected] | ||||
nameservers | NS2.TERENCENET.NET | |||
NS.TERENCENET.NET | ||||
xenith.net | contacts | tech | city | PALM SPRINGS |
fax | +1.7603255504 | |||
name | DNS Admin | |||
country | US | |||
phone | +1.7603254755 | |||
state | CA | |||
street | 1001 S PALM CANYON DR STE 217 | |||
postalcode | 92264-8349 | |||
organization | DNS Admin | |||
[email protected] | ||||
admin | city | San Luis Obispo | ||
fax | +1.7345724470 | |||
name | Phelan, Kelly | |||
country | US | |||
phone | +1.7349456066 | |||
state | CA | |||
street | 777 Mill St Apt 6 | |||
postalcode | 93401 | |||
organization | null | |||
[email protected] | ||||
nameservers | NS2.WEST-DATACENTER.NET | |||
NS1.WEST-DATACENTER.NET | ||||
tdbrecords.com | contacts | tech | city | Boston |
name | Jonah Livingston | |||
country | United States | |||
phone | 6172308529 | |||
state | Massachusetts | |||
street | 902 Huntington ave | |||
postalcode | 02115 | |||
organization | TDB Records | |||
[email protected] | ||||
admin | city | Boston | ||
name | Jonah Livingston | |||
country | United States | |||
phone | 6172308529 | |||
state | Massachusetts | |||
street | 902 Huntington ave | |||
postalcode | 02115 | |||
organization | TDB Records | |||
[email protected] | ||||
nameservers | NS1.DREAMHOST.COM | |||
NS2.DREAMHOST.COM | ||||
NS3.DREAMHOST.COM | ||||
electrichumanproject.com | contacts | tech | city | Tsukuba |
name | 840Domains Tsukuba 840Domains | |||
country | Japan | |||
phone | +81.5055349763 | |||
state | Ibaraki | |||
street | Baien 2-1-15\nSupuringutekku Tsukuba bld. 401 | |||
postalcode | 305-0045 | |||
organization | Tsukuba | |||
[email protected] | ||||
admin | city | Tsukuba | ||
name | 840Domains Tsukuba 840Domains | |||
country | Japan | |||
phone | +81.5055349763 | |||
state | Ibaraki | |||
street | Baien 2-1-15\nSupuringutekku Tsukuba bld. 401 | |||
postalcode | 305-0045 | |||
organization | Tsukuba | |||
[email protected] | ||||
nameservers | SNS41.WEBSITEWELCOME.COM | |||
SNS42.WEBSITEWELCOME.COM | ||||
besound.com | contacts | tech | city | San Diego |
fax | 858-450-0567 | |||
country | United States | |||
phone | 858-458-0490 | |||
state | California | |||
street | 5266 Eastgate Mall | |||
postalcode | 92121 | |||
organization | A+Net | |||
[email protected] | ||||
admin | city | LINDENHURST | ||
fax | 999 999 9999 | |||
name | Richard Lopez | |||
country | United States | |||
phone | (516) 226-8430 | |||
state | New York | |||
street | 180 34TH ST | |||
postalcode | 11757-3243 | |||
organization | BeSound Multimedia | |||
[email protected] | ||||
nameservers | BDNS.CV.SITEPROTECT.COM | |||
ADNS.CV.SITEPROTECT.COM | ||||
porkchopscreenprinting.com | contacts | tech | city | New York |
name | Domain Registrar | |||
country | US | |||
phone | +1.9027492701 | |||
state | NY | |||
street | 575 8th Avenue 11th Floor | |||
postalcode | 10018 | |||
organization | Register.Com | |||
admin | city | Seattle | ||
name | Damon Baldwin | |||
country | US | |||
phone | +1.2067064764 | |||
state | WA | |||
street | 9218 9th ave NW | |||
postalcode | 98117 | |||
organization | Pork Chop Screen Printing | |||
nameservers | ns1.hosting-advantage.com | |||
ns2.hosting-advantage.com | ||||
kinseyvisual.com | contacts | tech | city | Culver City |
fax | +1.8186498230 | |||
name | ADMINISTRATOR, DOMAIN | |||
country | US | |||
phone | +1.8775784000 | |||
state | CA | |||
street | 8520 National Blvd. #A | |||
postalcode | 90232 | |||
organization | Media Temple | |||
[email protected] | ||||
admin | city | SAN DIEGO | ||
fax | +1.6195449594 | |||
name | Kinsey, Dave | |||
country | US | |||
phone | +1.6195449595 | |||
state | CA | |||
street | 705 12TH AVE | |||
postalcode | 92101-6507 | |||
organization | BlkMkrt Inc. | |||
[email protected] | ||||
nameservers | NS1.MEDIATEMPLE.NET | |||
NS2.MEDIATEMPLE.NET | ||||
rathergood.com | contacts | tech | city | London |
fax | +1.9999999999 | |||
name | Veitch, Joel | |||
country | UK | |||
phone | +1.08072547734 | |||
state | null | |||
street | 10 Croston Street | |||
postalcode | null | |||
organization | null | |||
[email protected] | ||||
admin | city | London | ||
fax | +1.9999999999 | |||
name | Veitch, Joel | |||
country | UK | |||
phone | +1.08072547734 | |||
state | null | |||
street | 10 Croston Street | |||
postalcode | null | |||
organization | null | |||
[email protected] | ||||
nameservers | NS1.DREAMHOST.COM | |||
NS3.DREAMHOST.COM | ||||
NS2.DREAMHOST.COM | ||||
lepoint.fr | contacts | tech | city | Paris |
handle | GR283-FRNIC | |||
name | GANDI ROLE | |||
country | FR | |||
street | Gandi\n15, place de la Nation | |||
postalcode | 75011 | |||
type | ROLE | |||
[email protected] | ||||
changedate | 2006-03-03T00:00:00 | |||
admin | city | Paris | ||
handle | SDED175-FRNIC | |||
name | SOCIETE D'EXPLOITATION DE L'HEBDOMADAIRE LE POINT | |||
country | FR | |||
phone | +33 1 44 10 10 10 | |||
street | 74, avenue du maine | |||
postalcode | 75014 | |||
type | ORGANIZATION | |||
[email protected] | ||||
changedate | 2013-07-10T00:00:00 | |||
nameservers | b.dns.gandi.net | |||
a.dns.gandi.net | ||||
c.dns.gandi.net | ||||
revhq.com | contacts | tech | city | HUNTINGTON BEACH |
fax | +1.5555555555 | |||
name | JORDAN COOPER | |||
country | US | |||
phone | +1.7148427584 | |||
state | CA | |||
street | P.O. BOX 5232 | |||
postalcode | 92615 | |||
organization | REV DISTRIBUTION | |||
[email protected] | ||||
admin | city | HUNTINGTON BEACH | ||
fax | +1.5555555555 | |||
name | JORDAN COOPER | |||
country | US | |||
phone | +1.7148427584 | |||
state | CA | |||
street | P.O. BOX 5232 | |||
postalcode | 92615 | |||
organization | REV DISTRIBUTION | |||
[email protected] | ||||
nameservers | NS1.CLOUDNS.NET | |||
NS2.CLOUDNS.NET | ||||
NS3.CLOUDNS.NET | ||||
NS4.CLOUDNS.NET | ||||
poprocksandcoke.com | contacts | tech | city | Ljubljana |
name | Matija Zajec | |||
country | Slovenia | |||
phone | +386.30363699 | |||
state | Osrednjeslovenska | |||
street | Krizevniska ulica 7 | |||
postalcode | 1000 | |||
[email protected] | ||||
admin | city | Ljubljana | ||
name | Matija Zajec | |||
country | Slovenia | |||
phone | +386.30363699 | |||
state | Osrednjeslovenska | |||
street | Krizevniska ulica 7 | |||
postalcode | 1000 | |||
[email protected] | ||||
nameservers | NS3.WEBDNS.PW | |||
NS4.WEBDNS.PW | ||||
samuraiblue.com | contacts | tech | city | Louisville |
fax | +1.5025692774 | |||
name | MaximumASP, LLC | |||
country | US | |||
phone | +1.5025692771 | |||
state | KY | |||
street | 540 Baxter Avenue | |||
postalcode | 40204 | |||
organization | MaximumASP, LLC | |||
[email protected] | ||||
admin | city | Tampa | ||
fax | +1.9999999999 | |||
name | Meronek, Rob | |||
country | US | |||
phone | +1.838575819 | |||
state | FL | |||
street | 777 North Ashley Drive #1212 | |||
postalcode | 33602 | |||
organization | The Boardr | |||
[email protected] | ||||
nameservers | DNS1.MIDPHASE.COM | |||
DNS2.MIDPHASE.COM | ||||
openbsd.org | contacts | tech | city | Calgary Alberta |
handle | CR32086106 | |||
name | Theos Software | |||
country | CA | |||
phone | +1.40323798 | |||
state | Alberta | |||
street | 812 23rd ave SE | |||
postalcode | T2G 1N8 | |||
organization | Theos Software | |||
[email protected] | ||||
admin | city | Calgary | ||
handle | CR32086107 | |||
name | Theo de Raadt | |||
country | CA | |||
phone | +1.4032379834 | |||
state | Alberta | |||
street | 812 23rd Ave SE | |||
postalcode | T2G1N8 | |||
organization | Theos Software | |||
[email protected] | ||||
nameservers | NS1.TELSTRA.NET | |||
NS.SIGMASOFT.COM | ||||
NS1.SUPERBLOCK.NET | ||||
NS2.SUPERBLOCK.NET | ||||
ZEUS.THEOS.COM | ||||
C.NS.BSWS.DE | ||||
A.NS.BSWS.DE | ||||
sysblog.com | contacts | tech | city | Waltham |
fax | +1.7818392801 | |||
name | Toll Free: 866-822-9073 Worldwide: 339-222-5132 This Domain For Sale | |||
country | US | |||
phone | +1.8668229073 | |||
state | MA | |||
street | 738 Main Street #389 | |||
postalcode | 02451 | |||
organization | BuyDomains.com | |||
[email protected] | ||||
admin | city | Waltham | ||
fax | +1.7818392801 | |||
name | Toll Free: 866-822-9073 Worldwide: 339-222-5132 This Domain For Sale | |||
country | US | |||
phone | +1.8668229073 | |||
state | MA | |||
street | 738 Main Street #389 | |||
postalcode | 02451 | |||
organization | BuyDomains.com | |||
[email protected] | ||||
nameservers | NS.BUYDOMAINS.COM | |||
THIS-DOMAIN-FOR-SALE.COM | ||||
voicesofsafety.com | contacts | tech | city | Burlington |
fax | +1.782722915 | |||
name | BizLand.com, Inc. | |||
country | US | |||
phone | +1.782725585 | |||
state | MA | |||
street | 121 Middlesex Turnpike | |||
postalcode | 01803 | |||
organization | BizLand.com, Inc. | |||
[email protected] | ||||
admin | city | NORTHE COLDWELL | ||
fax | +1.9732280276 | |||
name | VOICESOFSAFTY INT'L | |||
country | US | |||
phone | +1.9732282258 | |||
state | NJ | |||
street | 264 park ave | |||
postalcode | 07006 | |||
organization | VOICESOFSAFTY INT'L | |||
[email protected] | ||||
nameservers | CLICKME2.CLICK2SITE.COM | |||
CLICKME.CLICK2SITE.COM |
Read DNS information¶
Do the same with the DNS files...
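A typical answer line in one of the cached dig files looks like the sketch below (TTL value made up), which is why the parser keys on records[3] (the record type) and records[4] (the first token of the record data):
# Example line from `dig +nocmd <domain> any +multiline +noall +answer`:
#   altpress.org.  86400  IN  NS  dns.dadatypo.net.
# l.split() -> ['altpress.org.', '86400', 'IN', 'NS', 'dns.dadatypo.net.']
#                records[0]      [1]      [2]   [3]   [4]
# Note: for MX/TXT records, records[4] is only the priority / first token.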
from pprint import pprint
import re
import traceback
# Global DF frames
frames = []
def read_dns(domains):
for d in domains:
print("Reading WHOIS for\t %s" % d)
data = []
try:
with open('DNS/%s.txt' % d, 'r') as inputfile:
dns = inputfile.read()
                for l in dns.splitlines():
                    records = l.split()
                    # Keep only NS, MX, A, CNAME and TXT records
                    a = re.compile("^(NS|MX|A|CNAME|TXT)$")
                    if len(records) >= 5 and a.match(records[3]):
                        data.append([d, records[3], records[4]])
# Create DF only if data is not empty
if data:
df = pd.DataFrame(data, columns=['domain', 'dns_record', 'value'])
frames.append(df)
        except Exception, err:
            print("[ERROR] Couldn't read DNS for\t %s" % d)
traceback.print_exc()
#read_dns(uniq_subdomains)
read_dns(uniq_roots[:20])
Reading DNS for altpress.org
Reading DNS for nzfortress.co.nz
Reading DNS for evillasforsale.com
Reading DNS for playingenemy.com
Reading DNS for richardsonscharts.com
Reading DNS for xenith.net
Reading DNS for tdbrecords.com
Reading DNS for electrichumanproject.com
Reading DNS for tweekerchick.blogspot.com
Reading DNS for besound.com
Reading DNS for porkchopscreenprinting.com
Reading DNS for kinseyvisual.com
Reading DNS for rathergood.com
Reading DNS for lepoint.fr
Reading DNS for revhq.com
Reading DNS for poprocksandcoke.com
Reading DNS for samuraiblue.com
Reading DNS for openbsd.org
Reading DNS for sysblog.com
Reading DNS for voicesofsafety.com
df_dns = pd.concat(frames)
df_dns.set_index(['domain', 'dns_record'])
domain | dns_record | value
---|---|---|
altpress.org | NS | dns.dadatypo.net. |
NS | dns.nothingness.org. | |
nzfortress.co.nz | NS | ns-1637.awsdns-12.co.uk. |
NS | ns-913.awsdns-50.net. | |
NS | ns-203.awsdns-25.com. | |
NS | ns-1284.awsdns-32.org. | |
evillasforsale.com | NS | ns2.pcmend.net. |
NS | ns1.pcmend.net. | |
playingenemy.com | NS | dns04.a2z-server.jp. |
NS | ns04.a2z-server.jp. | |
richardsonscharts.com | NS | ns2.interbasix.net. |
A | 207.97.239.35 | |
MX | 10 | |
TXT | "v=spf1 | |
NS | ns.interbasix.net. | |
MX | 30 | |
MX | 40 | |
MX | 20 | |
xenith.net | NS | ns1.west-datacenter.net. |
NS | ns2.west-datacenter.net. | |
A | 206.130.121.98 | |
MX | 10 | |
tdbrecords.com | NS | ns2.dreamhost.com. |
NS | ns1.dreamhost.com. | |
MX | 0 | |
NS | ns3.dreamhost.com. | |
MX | 0 | |
A | 75.119.220.89 | |
electrichumanproject.com | NS | sns41.websitewelcome.com. |
NS | sns42.websitewelcome.com. | |
A | 67.18.68.14 | |
tweekerchick.blogspot.com | CNAME | blogspot.l.googleusercontent.com. |
A | 173.194.44.10 | |
A | 173.194.44.12 | |
A | 173.194.44.11 | |
besound.com | NS | bdns.cv.siteprotect.com. |
NS | adns.cv.siteprotect.com. | |
porkchopscreenprinting.com | NS | ns1.hosting-advantage.com. |
NS | ns2.hosting-advantage.com. | |
A | 64.92.121.42 | |
MX | 5 | |
kinseyvisual.com | A | 205.186.183.161 |
NS | ns1.mediatemple.net. | |
MX | 10 | |
NS | ns2.mediatemple.net. | |
rathergood.com | MX | 0 |
NS | ns2.dreamhost.com. | |
NS | ns1.dreamhost.com. | |
MX | 0 | |
NS | ns3.dreamhost.com. | |
A | 64.90.57.150 | |
lepoint.fr | NS | c.dns.gandi.net. |
NS | b.dns.gandi.net. | |
NS | a.dns.gandi.net. | |
revhq.com | NS | ns1.cloudns.net. |
NS | ns4.cloudns.net. | |
NS | ns3.cloudns.net. | |
NS | ns2.cloudns.net. | |
poprocksandcoke.com | A | 184.164.147.132 |
MX | 0 | |
NS | ns3.webdns.pw. | |
NS | ns4.webdns.pw. | |
samuraiblue.com | NS | dns1.anhosting.com. |
NS | dns2.anhosting.com. | |
MX | 0 | |
TXT | "v=spf1 | |
A | 174.127.110.249 | |
openbsd.org | NS | c.ns.bsws.de. |
NS | ns2.superblock.net. | |
A | 129.128.5.194 | |
NS | a.ns.bsws.de. | |
NS | ns1.superblock.net. | |
NS | ns.sigmasoft.com. | |
NS | ns1.telstra.net. | |
NS | zeus.theos.com. | |
MX | 10 | |
MX | 6 | |
sysblog.com | MX | 0 |
A | 66.151.181.49 | |
TXT | "v=spf1 | |
voicesofsafety.com | NS | clickme.click2site.com. |
NS | clickme2.click2site.com. |
Connect to targets¶
I'll connect to every single target over HTTP(S) using urllib2 and store the HTTP response headers.
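As a quick illustration of what urllib2 exposes per response (using one of the targets above; requires network access):
# Fetch a single URL and list its response headers
import urllib2
r = urllib2.urlopen('http://www.openbsd.org/', timeout=5)
print(r.code)            # e.g. 200
print(r.info().items())  # list of (header, value) tuples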
import urllib2
import httplib
c_targets = [t for t in targets['Target'][:20]]
frames = []
# Collect here all URLs failed to connect to
error_urls = []
def send_request(target, data):
""" Sends a single request to the target """
# Set own headers
headers = {'User-Agent' : 'Mozilla 5.10'}
# Create request
request = urllib2.Request(target, None, headers)
# Default response
response = None
try:
# Send request
response = urllib2.urlopen(request, timeout=5)
        # Store every response header as its own row
        for h, v in response.info().items():
            data.append([target, response.code, h, v])
except urllib2.HTTPError, e:
print('[ERROR] HTTPError = ' + str(e.code))
data.append([target, e.code, '', ''])
    except urllib2.URLError, e:
        print('[ERROR] URLError = ' + str(e.reason))
        # NOTE: e.reason may be an object such as socket.timeout,
        # which is what later breaks the Excel export of this DataFrame
        data.append([target, e.reason, '', ''])
    except ValueError, e:
        # Most probably the target didn't have a scheme,
        # so remember it and retry later with http:// prepended
        error_urls.append(target)
        print('[ERROR] ValueError = ' + e.message)
except httplib.HTTPException, e:
print('[ERROR] HTTPException')
except Exception:
import traceback
print('[ERROR] Exception: ' + traceback.format_exc())
finally:
return response
def open_connection(targets):
""" Iterate through targets and send requests """
data = []
for t in targets:
print("Connecting to\t %s" % t)
response = send_request(t, data)
# Create DF only if data is not empty
if data:
df = pd.DataFrame(data, columns=['url', 'response', 'header', 'value'])
frames.append(df)
# Open connection to targets and collect information
open_connection(c_targets)
# If any URLs couldn't be tested because they lacked a scheme,
# prepend http:// and run again
new_targets = ["http://"+u for u in error_urls]
open_connection(new_targets)
Connecting to http://www.altpress.org/
Connecting to http://www.nzfortress.co.nz
Connecting to http://www.evillasforsale.com
Connecting to http://www.playingenemy.com/
[ERROR] URLError = timed out
Connecting to http://www.richardsonscharts.com
Connecting to http://www.xenith.net
[ERROR] Exception: Traceback (most recent call last):
  File "<ipython-input-19-d057092f77b5>", line 26, in send_request
    response = urllib2.urlopen(request, timeout=5)
  File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
    return _opener.open(url, data, timeout)
  File "/usr/lib/python2.7/urllib2.py", line 401, in open
    response = self._open(req, data)
  File "/usr/lib/python2.7/urllib2.py", line 419, in _open
    '_open', req)
  File "/usr/lib/python2.7/urllib2.py", line 379, in _call_chain
    result = func(*args)
  File "/usr/lib/python2.7/urllib2.py", line 1211, in http_open
    return self.do_open(httplib.HTTPConnection, req)
  File "/usr/lib/python2.7/urllib2.py", line 1184, in do_open
    r = h.getresponse(buffering=True)
  File "/usr/lib/python2.7/httplib.py", line 1034, in getresponse
    response.begin()
  File "/usr/lib/python2.7/httplib.py", line 407, in begin
    version, status, reason = self._read_status()
  File "/usr/lib/python2.7/httplib.py", line 365, in _read_status
    line = self.fp.readline()
  File "/usr/lib/python2.7/socket.py", line 447, in readline
    data = self._sock.recv(self._rbufsize)
timeout: timed out
Connecting to http://www.tdbrecords.com
Connecting to http://www.electrichumanproject.com/
Connecting to http://tweekerchick.blogspot.com/
Connecting to http://www.besound.com/pushead/home.html
Connecting to http://www.porkchopscreenprinting.com/
Connecting to http://www.kinseyvisual.com
Connecting to http://www.rathergood.com
Connecting to http://www.lepoint.fr/
Connecting to http://www.revhq.com
Connecting to http://www.poprocksandcoke.com
Connecting to http://www.samuraiblue.com/
Connecting to http://www.openbsd.org/cgi-bin/man.cgi
Connecting to http://www.sysblog.com
Connecting to http://www.voicesofsafety.com
df_connection = pd.concat(frames)
df_connection.set_index(['url', 'response', 'header'])
url | response | header | value
---|---|---|---|
http://www.altpress.org/ | 200 | content-length | 24576 |
x-powered-by | PHP/5.2.4-2ubuntu5.27 | ||
set-cookie | PHPSESSID=1498f60d82d31ec081debde379e605eb; path=/ | ||
expires | Thu, 19 Nov 1981 08:52:00 GMT | ||
vary | Accept-Encoding | ||
server | Apache/2.2.8 (Ubuntu) PHP/5.2.4-2ubuntu5.27 with Suhosin-Patch mod_ssl/2.2.8 OpenSSL/0.9.8g | ||
last-modified | Wed, 06 Aug 2014 11:42:08 GMT | ||
connection | close | ||
etag | "8ea9fc88e045b56cd96e6fc8b487cbd9" | ||
pragma | no-cache | ||
cache-control | public,must-revalidate | ||
date | Wed, 06 Aug 2014 11:44:55 GMT | ||
content-type | text/html; charset=utf-8 | ||
http://www.nzfortress.co.nz | 200 | x-powered-by | PHP/5.3.10-1ubuntu3.6 |
transfer-encoding | chunked | ||
set-cookie | bblastvisit=1407325495; expires=Thu, 06-Aug-2015 11:44:55 GMT; path=/, bblastactivity=0; expires... | ||
vary | Accept-Encoding,User-Agent | ||
server | Apache/2.2.22 (Ubuntu) | ||
connection | close | ||
x-ua-compatible | IE=7 | ||
pragma | private | ||
cache-control | private | ||
date | Wed, 06 Aug 2014 11:44:55 GMT | ||
content-type | text/html; charset=ISO-8859-1 | ||
http://www.evillasforsale.com | 200 | content-length | 14610 |
accept-ranges | bytes | ||
vary | Accept-Encoding,User-Agent | ||
server | Apache/2 | ||
last-modified | Thu, 21 Jan 2010 13:33:43 GMT | ||
connection | close | ||
etag | "2040cf7-3912-47dacc06c1bc0" | ||
date | Wed, 06 Aug 2014 11:46:01 GMT | ||
content-type | text/html | ||
http://www.playingenemy.com/ | timed out | ||
http://www.richardsonscharts.com | 200 | x-powered-by | PleskLin |
transfer-encoding | chunked | ||
set-cookie | PHPSESSID=8cg77frbg8biv0ru8m7udb6877; path=/ | ||
expires | Thu, 19 Nov 1981 08:52:00 GMT | ||
server | Apache | ||
connection | close | ||
pragma | no-cache | ||
cache-control | no-store, no-cache, must-revalidate, post-check=0, pre-check=0 | ||
date | Wed, 06 Aug 2014 11:45:00 GMT | ||
content-type | text/html | ||
http://www.tdbrecords.com | 200 | content-length | 2600 |
accept-ranges | bytes | ||
vary | Accept-Encoding | ||
server | Apache | ||
last-modified | Mon, 03 Oct 2011 00:02:54 GMT | ||
connection | close | ||
etag | "a28-4ae59b253c780" | ||
date | Wed, 06 Aug 2014 11:46:45 GMT | ||
content-type | text/html | ||
http://www.electrichumanproject.com/ | 200 | content-length | 14683 |
accept-ranges | bytes | ||
vary | Accept-Encoding | ||
server | Apache | ||
last-modified | Tue, 05 Aug 2014 18:19:00 GMT | ||
connection | close | ||
date | Wed, 06 Aug 2014 11:45:06 GMT | ||
content-type | text/html | ||
http://tweekerchick.blogspot.com/ | 200 | alternate-protocol | 80:quic |
x-xss-protection | 1; mode=block | ||
x-content-type-options | nosniff | ||
expires | Wed, 06 Aug 2014 11:45:06 GMT | ||
server | GSE | ||
last-modified | Wed, 06 Aug 2014 05:34:08 GMT | ||
connection | close | ||
etag | "d6b75768-8b38-4991-b414-a06cc4608563" | ||
cache-control | private, max-age=0 | ||
date | Wed, 06 Aug 2014 11:45:06 GMT | ||
content-type | text/html; charset=UTF-8 | ||
http://www.besound.com/pushead/home.html | 200 | content-length | 3870 |
accept-ranges | bytes | ||
server | Apache | ||
last-modified | Fri, 09 Jun 2006 04:34:30 GMT | ||
connection | close | ||
etag | "f1e-415c31dd2c180" | ||
date | Wed, 06 Aug 2014 11:45:07 GMT | ||
content-type | text/html | ||
http://www.porkchopscreenprinting.com/ | 200 | content-length | 11811 |
set-cookie | HttpOnly;Secure | ||
accept-ranges | bytes | ||
expires | Wed, 06 Aug 2014 11:45:27 GMT | ||
server | Apache | ||
last-modified | Tue, 28 Aug 2012 17:44:17 GMT | ||
connection | close | ||
etag | "b893e5-2e23-503d0371" | ||
cache-control | max-age=20 | ||
date | Wed, 06 Aug 2014 11:45:07 GMT | ||
content-type | text/html | ||
http://www.kinseyvisual.com | 200 | x-powered-by | PHP/5.3.27 |
transfer-encoding | chunked | ||
set-cookie | PHPSESSID=b5f9f0af80bf4e08f41eeb02be6e6ad1; path=/ | ||
expires | Thu, 19 Nov 1981 08:52:00 GMT | ||
vary | User-Agent,Accept-Encoding | ||
server | Apache/2.2.22 | ||
connection | close | ||
pragma | no-cache | ||
cache-control | no-store, no-cache, must-revalidate, post-check=0, pre-check=0 | ||
date | Wed, 06 Aug 2014 11:45:08 GMT | ||
content-type | text/html | ||
http://www.rathergood.com | 200 | transfer-encoding | chunked |
set-cookie | c6ef959f4780c6a62e86c7a2d2e5ccea=4ilfnp83k67evmmn281i9qcnu3; path=/ | ||
vary | Accept-Encoding | ||
server | Apache | ||
connection | close | ||
pragma | no-cache | ||
cache-control | no-cache, max-age=0, no-cache | ||
date | Wed, 06 Aug 2014 11:45:08 GMT | ||
p3p | CP="NOI ADM DEV PSAi COM NAV OUR OTRo STP IND DEM" | ||
content-type | text/html; charset=utf-8 | ||
x-mod-pagespeed | 1.6.29.7-3566 | ||
http://www.lepoint.fr/ | 200 | x-xss-protection | 1; mode=block |
x-content-type-options | nosniff | ||
x-powered-by | PHP/5.5.9 | ||
transfer-encoding | chunked | ||
vary | User-Agent,Accept-Encoding | ||
server | Apache/2.2.25 (Unix) PHP/5.5.9 | ||
connection | close | ||
date | Wed, 06 Aug 2014 11:45:09 GMT | ||
x-frame-options | SAMEORIGIN | ||
content-type | text/html | ||
http://www.revhq.com | 200 | x-powered-by | Atari TT posix / Python / php 5.3x |
transfer-encoding | chunked | ||
set-cookie | PHPSESSID=e1jmcg9c2pgbi9rhgcdkhq5ge4; path=/ | ||
expires | Thu, 19 Nov 1981 08:52:00 GMT | ||
vary | Accept-Encoding | ||
server | Apache/2.2.22 | ||
connection | close | ||
pragma | no-cache | ||
cache-control | no-store, no-cache, must-revalidate, post-check=0, pre-check=0 | ||
date | Wed, 06 Aug 2014 11:45:19 GMT | ||
content-type | text/html | ||
http://www.poprocksandcoke.com | 200 | x-powered-by | PHP/5.3.24 |
transfer-encoding | chunked | ||
server | Apache | ||
connection | close | ||
date | Wed, 06 Aug 2014 11:45:10 GMT | ||
content-type | text/html; charset=UTF-8 | ||
x-pingback | http://www.poprocksandcoke.com/xmlrpc.php | ||
http://www.samuraiblue.com/ | 200 | content-length | 54005 |
x-powered-by | PHP/5.4.31 | ||
server | Apache | ||
connection | close | ||
date | Wed, 06 Aug 2014 11:45:12 GMT | ||
content-type | text/html; charset=UTF-8 | ||
x-pingback | http://samuraiblue.com/xmlrpc.php | ||
http://www.openbsd.org/cgi-bin/man.cgi | 200 | transfer-encoding | chunked |
server | Apache | ||
connection | close | ||
pragma | no-cache | ||
cache-control | no-cache | ||
date | Wed, 06 Aug 2014 11:45:13 GMT | ||
content-type | text/html; charset=utf-8 | ||
http://www.sysblog.com | 200 | content-length | 48663 |
x-varnish | 718735313 718731229 | ||
x-cache | HIT | ||
x-powered-by | PHP/5.3.16 | ||
set-cookie | PHPSESSID=5vk936712pnke6t5ki26n9frf4; path=/ | ||
accept-ranges | bytes | ||
expires | Thu, 19 Nov 1981 08:52:00 GMT | ||
server | Apache | ||
connection | close | ||
via | 1.1 varnish | ||
pragma | no-cache | ||
cache-control | no-store, no-cache, must-revalidate, post-check=0, pre-check=0 | ||
date | Wed, 06 Aug 2014 11:45:14 GMT | ||
content-type | text/html; charset=UTF-8 | ||
age | 40 | ||
http://www.voicesofsafety.com | 200 | content-length | 20854 |
accept-ranges | bytes, bytes | ||
server | Apache/2 | ||
connection | close | ||
date | Wed, 06 Aug 2014 11:45:15 GMT | ||
content-type | text/html | ||
age | 0 |
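With everything in a DataFrame, ad-hoc questions become one-liners. For example, a rough census of the web servers behind the targets (a sketch, assuming df_connection was built as above):
# Count the distinct Server headers across all responses
servers = df_connection[df_connection['header'] == 'server']
print(servers['value'].value_counts())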
Save to Excel¶
Now feel free to do whatever you want with your DataFrames: export them to CSV, Excel, plain text etc.
from pandas import ExcelWriter
writer = ExcelWriter('Excel/output.xls')
df_whois.to_excel(writer, "Sheet - WHOIS")
df_dns.to_excel(writer, "Sheet - DNS")
#df_connection.to_excel(writer, "Sheet - Connections")
# Flush the workbook to disk
writer.save()
Since I wasn't able to export df_connection to Excel (Exception: Unexpected data type <class 'socket.timeout'>), I had to export it to CSV:
df_connection.to_csv("Excel/connection.csv", sep="\t", header=True)
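Alternatively, stringifying the offending cells first should make the Excel export work too; an untested sketch, reusing the writer from above:
# The 'response' column holds a socket.timeout object for timed-out targets,
# so cast non-primitive cells to strings before handing the frame to the writer
df_conn_str = df_connection.applymap(
    lambda v: v if isinstance(v, (int, long, float)) else unicode(v))
df_conn_str.to_excel(writer, "Sheet - Connections")
writer.save()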