This script lets you detect scam clone websites by scanning a suspicious URL on urlscan.io, extracting all image URLs used on the site, hashing them, and then searching for other websites using the same images. Perfect for detecting mass-produced phishing or fake investment sites that share logos, banners, or background assets.
Built-in GUI lets you manually select which images to use for scanning related sites, so you can avoid useless icons or logos.
Requirements:
- Python 3.8+
- Install dependencies:
pip install requests pillow python-whois
(Linux users may also need: sudo apt install python3-tk)
- Get a free API key from urlscan.io.
- Place it in the script where it says:
API_KEY = "Your urlscan.io API KEY here"
How It Works:
- You input a Scam Website URL.
- The script submits it to urlscan.io for scanning and waits for the result.
- It then:
- Extracts all image URLs used by the page.
- Hashes each image with SHA256.
- Displays a scrollable GUI showing thumbnails + checkboxes.
- You choose which images are unique or likely scam-related (e.g. logos, trading screenshots).
- The script finds other websites using the same image hash via the urlscan.io search API.
- All matches are saved to [domain]_merged_websites.txt.
- The script then:
- Asks you if you want to check the online status of each site [just hit enter].
- Identifies the status of the likely clones (as some might already be down).
- Pulls WHOIS registration dates for active sites (so you know if it’s newly created).
Use Cases:
- Identify clusters of scam websites using reused branding or templates.
- Monitor if a specific scam campaign is expanding.
- Get WHOIS data and uptime status for reports to hosts/registrars.
Output Includes:
- Status: UP / DOWN / PHISHING / SSL ERROR
- Registration dates (from WHOIS)
- List of related scam sites in a .txt file
GUI Preview:
When the GUI opens, you can visually select which images should be used to detect clones. This lets you skip icons or common assets and focus only on identifying scam-specific content.
Note:
- The success of this depends on the urlscan.io database — if a scam site wasn't scanned there, you won't find it.
- You can test it with the .top sites as well; I focused on investment scam websites (which pop up like herpes, so I can find/report them).
Feel free to fork or improve the script. It’s designed with scambaiters in mind: fast, visual, and practical.
Happy hunting!
Script
import os
import time
import requests
import hashlib
import random
import socket
from urllib.parse import urlparse
import tkinter as tk
from tkinter import ttk
from PIL import Image, ImageTk, ImageSequence
import io
from PIL import UnidentifiedImageError
import whois
import datetime
# urlscan.io API key — replace this placeholder with your own free key
# from https://urlscan.io before running the script.
API_KEY = "Your urlscan.io API KEY here"

# Pool of browser User-Agent strings; check_website_status() picks one at
# random per request so status probes look less like automated traffic.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0",
    "Mozilla/5.0 (Linux; Android 10; SM-G960F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Mobile Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0",
    "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; AS; ASBXJS; en-US) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36 Edge/12.0"
]
def scan_website(api_key, website_url):
    """Submit *website_url* to urlscan.io for a public scan.

    Returns the scan UUID string on success, or None if submission failed.
    """
    endpoint = "https://urlscan.io/api/v1/scan/"
    request_headers = {'API-Key': api_key, 'Content-Type': 'application/json'}
    payload = {"url": website_url, "visibility": "public"}
    response = requests.post(endpoint, headers=request_headers, json=payload)
    # Guard clause: anything other than 200 is treated as a failed submission.
    if response.status_code != 200:
        print(f"[-] Error submitting scan: {response.status_code} - {response.text}")
        return None
    scan_uuid = response.json()['uuid']
    print(f"[+] Scan submitted. UUID: {scan_uuid}")
    return scan_uuid
def wait_for_result(scan_uuid, max_attempts=60):
    """Poll urlscan.io until the scan result for *scan_uuid* is ready.

    urlscan.io returns HTTP 404 while a scan is still processing, so we poll
    every 5 seconds. The original implementation looped forever; a scan that
    never completes would hang the script, so polling is now bounded.

    Args:
        scan_uuid: UUID returned by scan_website().
        max_attempts: maximum number of polls before giving up (default 60,
            i.e. about 5 minutes).

    Returns:
        The result JSON as a dict, or None on error or timeout.
    """
    result_url = f"https://urlscan.io/api/v1/result/{scan_uuid}/"
    for _ in range(max_attempts):
        response = requests.get(result_url)
        if response.status_code == 200:
            print("[+] Scan completed.")
            return response.json()
        elif response.status_code == 404:
            # 404 means "not finished yet", not "not found" — keep waiting.
            print("[+] Scan still processing... Waiting 5s")
            time.sleep(5)
        else:
            print(f"[-] Error retrieving scan: {response.status_code}")
            return None
    print(f"[-] Scan did not finish after {max_attempts} attempts; giving up.")
    return None
def hash_image_from_url(url):
    """Download the image at *url* and return its SHA256 hex digest.

    Returns None on any network error, timeout, or non-200 response.
    """
    try:
        response = requests.get(url, timeout=10)
    except Exception:
        return None
    if response.status_code != 200:
        return None
    return hashlib.sha256(response.content).hexdigest()
def extract_images_and_hashes(json_data):
    """Pull image URLs out of a urlscan.io result and hash each one.

    Returns a list of (url, sha256_hex) tuples; entries that could not be
    downloaded get the placeholder string '[NO HASH FOUND]'.
    """
    image_exts = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg', '.ico')
    pairs = []
    for url in json_data.get('lists', {}).get('urls', []):
        # str.endswith accepts a tuple of suffixes, so one call covers them all.
        if not url.lower().endswith(image_exts):
            continue
        digest = hash_image_from_url(url)
        pairs.append((url, digest if digest else '[NO HASH FOUND]'))
    return pairs
def normalize_url(url):
    """Reduce *url* to a canonical 'https://<host>/' form for de-duplication.

    Forces https, drops a leading 'www.' and discards the path entirely so
    that different pages on the same host collapse to one entry.

    BUG FIX: the original used netloc.lstrip("www."), which strips the
    *character set* {'w', '.'} — mangling hosts such as 'web.example.com'
    into 'eb.example.com'. We now remove the literal 'www.' prefix only.
    (The original's if/else also had identical branches; the dead path
    computation is removed.)
    """
    parsed = urlparse(url.replace("http://", "https://"))
    netloc = parsed.netloc
    if netloc.startswith("www."):
        netloc = netloc[len("www."):]
    return f"https://{netloc}/"
def search_hash_in_urlscan(hash_value, api_key):
    """Query the urlscan.io search API for scans containing *hash_value*.

    Returns the list of result dicts (up to 100), or an empty list on error.
    """
    endpoint = "https://urlscan.io/api/v1/search/"
    query = {'q': f"hash:{hash_value}", 'size': 100}
    response = requests.get(
        endpoint,
        headers={'API-Key': api_key, 'Content-Type': 'application/json'},
        params=query,
    )
    if response.status_code != 200:
        print(f"[-] Error searching hash: {response.status_code}")
        return []
    return response.json().get('results', [])
def get_websites_for_hash(hash_val, api_key):
    """Find distinct websites whose scans contain the image hash *hash_val*.

    Each hit's URL list is normalized to 'https://<host>/' form; falls back
    to the scan's task URL when the hit carries no URL list. Returns at most
    100 sites, sorted alphabetically.
    """
    hits = search_hash_in_urlscan(hash_val, api_key)
    seen = set()
    for hit in hits:
        candidate_urls = hit.get('lists', {}).get('urls', [])
        if not candidate_urls:
            task_url = hit.get('task', {}).get('url', '')
            candidate_urls = [task_url] if task_url else []
        for candidate in candidate_urls:
            seen.add(normalize_url(candidate))
    return sorted(seen)[:100]
def open_image_window(image_url_hash_pairs, domain_name, target_url):
    """Show a scrollable Tk window of image thumbnails with checkboxes.

    The user ticks the images that look scam-specific; pressing "Proceed"
    searches urlscan.io for other sites using those image hashes and hands
    the merged results to save_results_to_file().

    Args:
        image_url_hash_pairs: list of (image_url, sha256_hex) tuples.
        domain_name: bare domain of the scanned site (used in output names).
        target_url: the original URL the user entered.
    """
    # Local imports duplicate the module-level ones; kept as-is.
    import tkinter as tk
    from tkinter import ttk
    import requests
    from PIL import Image, ImageTk
    import io
    import os

    def on_proceed():
        # Collect the URLs whose checkbox is ticked, look up each one's
        # precomputed hash, and search urlscan.io for sites sharing it.
        selected_images = [url for (var, url) in checkboxes if var.get()]
        results = {}
        for url in selected_images:
            hash_val = next((hash_val for img_url, hash_val in image_url_hash_pairs if img_url == url), None)
            if hash_val:
                websites = get_websites_for_hash(hash_val, API_KEY)
                results[url] = websites
        save_results_to_file(results, domain_name, target_url)
        window.destroy()

    window = tk.Tk()
    window.title("Select Relevant Images")
    # --- SCROLLABLE CANVAS SETUP ---
    canvas = tk.Canvas(window)
    v_scrollbar = ttk.Scrollbar(window, orient="vertical", command=canvas.yview)
    h_scrollbar = ttk.Scrollbar(window, orient="horizontal", command=canvas.xview)
    scrollable_frame = ttk.Frame(canvas)

    def on_frame_configure(event):
        # Expand the canvas scroll region to cover everything in the frame.
        canvas.configure(scrollregion=canvas.bbox("all"))

    scrollable_frame.bind("<Configure>", on_frame_configure)
    canvas.create_window((0, 0), window=scrollable_frame, anchor="nw")
    canvas.configure(yscrollcommand=v_scrollbar.set, xscrollcommand=h_scrollbar.set)
    checkboxes = []
    max_images_per_row = 10
    img_width = 100
    img_height = 100
    # SVGs are skipped: PIL cannot rasterize them for thumbnails.
    filtered_pairs = [pair for pair in image_url_hash_pairs if not pair[0].lower().endswith(".svg")]
    for idx, (url, hash_val) in enumerate(filtered_pairs):
        try:
            img_data = requests.get(url, timeout=10).content
            img = Image.open(io.BytesIO(img_data))
            img.thumbnail((img_width, img_height))
            img_tk = ImageTk.PhotoImage(img)
            label = tk.Label(scrollable_frame, image=img_tk)
            # Keep a reference on the widget so Tk doesn't garbage-collect it.
            label.image = img_tk
            # Thumbnails occupy even grid rows, their checkboxes the odd rows.
            row, col = divmod(idx, max_images_per_row)
            label.grid(row=row * 2, column=col)
            var = tk.BooleanVar()
            # Truncate long filenames
            filename = os.path.basename(url)
            if len(filename) > 20:
                name_part, ext = os.path.splitext(filename)
                display_name = name_part[:15] + "...." + ext
            else:
                display_name = filename
            checkbox = tk.Checkbutton(scrollable_frame, text=display_name, variable=var)
            checkbox.grid(row=row * 2 + 1, column=col)
            checkboxes.append((var, url))
        except Exception as e:
            # An image that fails to download/decode is skipped, not fatal.
            print(f"[-] Error loading image from {url}: {e}")
            continue
    proceed_button = ttk.Button(scrollable_frame, text="Proceed", command=on_proceed)
    proceed_button.grid(row=(len(filtered_pairs) // max_images_per_row + 1) * 2, column=0, columnspan=max_images_per_row)
    # Pack everything
    canvas.grid(row=0, column=0, sticky="nsew")
    v_scrollbar.grid(row=0, column=1, sticky="ns")
    h_scrollbar.grid(row=1, column=0, sticky="ew")
    # Configure grid weights to expand canvas
    window.grid_rowconfigure(0, weight=1)
    window.grid_columnconfigure(0, weight=1)
    # --- Set initial window size ---
    width = img_width * max_images_per_row + 50
    total_rows = ((len(filtered_pairs) - 1) // max_images_per_row + 1) * 2 + 1
    height = img_height * total_rows + 100
    max_height = 600
    max_width = 1100  # max width before horizontal scrollbar needed
    window.geometry(f"{min(width, max_width)}x{min(height, max_height)}")
    window.mainloop()
def save_results_to_file(results_dict, domain_name, target_url):
    """Merge all matched clone-site URLs, write them to a text file, then
    hand the list to check_website_status() after a confirmation prompt.

    Args:
        results_dict: mapping of image URL -> list of sites using that image.
        domain_name: bare domain of the scanned site (used in the filename).
        target_url: the original URL the user entered (passed through).
    """
    # De-duplicate across all selected images and sort for stable output.
    merged_list = sorted(set(url for urls in results_dict.values() for url in urls))
    filename = f"{domain_name}_merged_websites.txt"
    with open(filename, "w") as f:
        for url in merged_list:
            f.write(url + "\n")
    # BUG FIX: the message previously printed a literal placeholder instead
    # of the actual output filename.
    print(f"\n[+] Merged website list saved to {filename}")
    input("Press Enter to check status of these websites...\n")
    check_website_status(merged_list, domain_name, target_url)
def check_website_status(websites, domain_name, target_url):
    """Probe each site, classify it, then WHOIS the live/suspicious ones.

    Each site is fetched with a random User-Agent and sorted into one of:
    up (200), phishing (403 — often a hosting/CDN takedown block, heuristic),
    ssl (495 or SSL errors), down (connection refused), error (anything else).
    For "up" and "phishing" sites the WHOIS creation date is fetched and the
    report is written to <domain_name>_active_sites_with_dates.txt.

    Fixes vs. the original:
      * The final print showed a literal placeholder instead of the filename.
      * An unused socket.gethostbyname() lookup (result never read) has been
        removed — it only added one DNS query per site.
      * domain_dates values are always strings by the time they are written
        (converted via str() above), so the dead datetime branch is gone.
    """
    grouped = {"up": [], "down": [], "phishing": [], "ssl": [], "error": []}
    for site in websites:
        try:
            headers = {
                "User-Agent": random.choice(user_agents),
                "Accept": "text/html,application/xhtml+xml",
                "Accept-Language": "en-US,en;q=0.9",
            }
            res = requests.get(site, headers=headers, timeout=10)
            if res.status_code == 200:
                grouped["up"].append(site)
                print(f"[UP] {site}")
            elif res.status_code == 403:
                grouped["phishing"].append(site)
                print(f"[PHISHING] {site} (403)")
            elif res.status_code == 495 or "ssl" in res.text.lower():
                grouped["ssl"].append(site)
                print(f"[SSL] {site}")
            else:
                grouped["error"].append(site)
                print(f"[ERROR] {site} - HTTP {res.status_code}")
        except requests.exceptions.SSLError:
            grouped["ssl"].append(site)
            print(f"[SSL] {site}")
        except requests.exceptions.ConnectionError:
            grouped["down"].append(site)
            print(f"[DOWN] {site}")
        except Exception as e:
            grouped["error"].append(site)
            print(f"[ERROR] {site} - {e}")
        # Throttle so we don't hammer targets (or trip rate limits).
        time.sleep(5)

    print("\n=== Summary ===")
    for status, sites in grouped.items():
        print(f"{status.upper()}: {len(sites)}")

    # WHOIS only the sites that are worth reporting: live or flagged.
    domains_to_check = grouped["up"] + grouped["phishing"]
    domain_dates = {}
    for site in domains_to_check:
        domain = urlparse(site).netloc
        try:
            w = whois.whois(domain)
            reg_date = w.creation_date
            if isinstance(reg_date, list):
                # Some registrars return several dates; take the first.
                reg_date = reg_date[0]
            # Keep only the date part of 'YYYY-MM-DD HH:MM:SS'.
            domain_dates[site] = str(reg_date).split()[0] if reg_date else "[no data]"
        except Exception:
            domain_dates[site] = "[whois error]"

    print("\n=== Live or Phishing Sites with Registered Date ===")
    for site, reg_date in domain_dates.items():
        print(f"{site} — Registered on {reg_date}")

    filename = f"{domain_name}_active_sites_with_dates.txt"
    with open(filename, "w") as f:
        f.write("Scanned Site\tClone Site\tRegistered on:\n")
        for site, reg_date in domain_dates.items():
            f.write(f"{target_url}\t{site}\t{reg_date}\n")
    # BUG FIX: previously printed a literal placeholder, not the filename.
    print(f"\n[+] Saved live/phishing websites with registration dates to {filename}")
def main():
    """Entry point: scan a URL on urlscan.io, let the user pick images in
    the GUI, then search for clone sites sharing those image hashes.
    """
    target_url = input("Enter the website URL to scan: ").strip()
    uuid = scan_website(API_KEY, target_url)
    if not uuid:
        return
    result = wait_for_result(uuid)
    if not result:
        return
    pairs = extract_images_and_hashes(result)
    if not pairs:
        print("[-] No image URLs with hashes found.")
        return
    # Bare domain (no leading "www.") used to name the output files.
    # (The original computed this twice; the duplicate is removed.)
    domain_name = urlparse(target_url).netloc.replace("www.", "")
    open_image_window(pairs, domain_name, target_url)


if __name__ == "__main__":
    main()