Spaces:
Sleeping
Sleeping
Add urlscan.io to classify malicious URLs as junk.
Browse files- app.py +17 -0
- requirements.txt +9 -6
- url_tools.py +29 -0
- urlscan_client.py +146 -0
app.py
CHANGED
|
@@ -1,11 +1,16 @@
|
|
|
|
|
| 1 |
from fastapi import FastAPI
|
| 2 |
from fastapi.responses import JSONResponse, FileResponse
|
| 3 |
from pydantic import BaseModel
|
| 4 |
from enum import Enum
|
| 5 |
from transformers import pipeline
|
| 6 |
from phishing_datasets import submit_entry
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
app = FastAPI()
|
|
|
|
| 9 |
|
| 10 |
class MessageModel(BaseModel):
|
| 11 |
text: str
|
|
@@ -64,6 +69,18 @@ def get_robot_txt():
|
|
| 64 |
@app.post("/predict")
|
| 65 |
def predict(model: InputModel) -> OutputModel:
|
| 66 |
text = model.query.message.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
label = pipe(text)
|
| 68 |
if label[0]['label'] == 'LABEL_1':
|
| 69 |
submit_entry(model.query.sender, model.query.message.text)
|
|
|
|
| 1 |
+
import httpx
|
| 2 |
from fastapi import FastAPI
|
| 3 |
from fastapi.responses import JSONResponse, FileResponse
|
| 4 |
from pydantic import BaseModel
|
| 5 |
from enum import Enum
|
| 6 |
from transformers import pipeline
|
| 7 |
from phishing_datasets import submit_entry
|
| 8 |
+
from url_tools import extract_urls, resolve_short_url
|
| 9 |
+
from urlscan_client import UrlscanClient
|
| 10 |
+
import requests
|
| 11 |
|
| 12 |
app = FastAPI()
|
| 13 |
+
urlscan = UrlscanClient()
|
| 14 |
|
| 15 |
class MessageModel(BaseModel):
    # Raw text of the inbound message; predict() extracts URLs from it and
    # submits them to urlscan.io before running the classifier pipeline.
    text: str
|
|
|
|
| 69 |
@app.post("/predict")
|
| 70 |
def predict(model: InputModel) -> OutputModel:
|
| 71 |
text = model.query.message.text
|
| 72 |
+
|
| 73 |
+
urls = extract_urls(text)
|
| 74 |
+
results = [urlscan.scan(url) for url in urls]
|
| 75 |
+
|
| 76 |
+
for result in results:
|
| 77 |
+
overall = result.get('verdicts', {}).get('overall', {})
|
| 78 |
+
print(f"Checking verdict: {overall}")
|
| 79 |
+
if overall.get('hasVerdicts') and overall.get('score') > 0:
|
| 80 |
+
print("Match found. Submitting entry and returning JUNK.")
|
| 81 |
+
submit_entry(model.query.sender, model.query.message.text)
|
| 82 |
+
return OutputModel(action=ActionModel.JUNK, sub_action=SubActionModel.NONE)
|
| 83 |
+
|
| 84 |
label = pipe(text)
|
| 85 |
if label[0]['label'] == 'LABEL_1':
|
| 86 |
submit_entry(model.query.sender, model.query.message.text)
|
requirements.txt
CHANGED
|
@@ -1,7 +1,10 @@
|
|
| 1 |
-
fastapi
|
| 2 |
-
uvicorn[standard]
|
| 3 |
-
pydantic
|
| 4 |
-
transformers
|
| 5 |
torch
|
| 6 |
-
datasets
|
| 7 |
-
pandas
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi~=0.115.12
|
| 2 |
+
uvicorn[standard]~=0.34.2
|
| 3 |
+
pydantic~=2.11.4
|
| 4 |
+
transformers~=4.51.3
|
| 5 |
torch
|
| 6 |
+
datasets~=3.6.0
|
| 7 |
+
pandas~=2.2.3
|
| 8 |
+
httpx~=0.28.1
|
| 9 |
+
numpy~=2.2.5
|
| 10 |
+
requests~=2.32.3
|
url_tools.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from urllib.parse import urlparse, urlunparse
|
| 3 |
+
import httpx
|
| 4 |
+
|
| 5 |
+
def extract_urls(text: str) -> list[str]:
    """Extract URLs from raw text.

    Matches scheme-prefixed URLs, www-prefixed hosts, and bare domains,
    each with an optional path. Duplicates are dropped (first occurrence
    wins) so the same URL is not scanned more than once downstream.
    """
    url_pattern = r"""(?:(?:https?:\/\/|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?:\/[^\s<>"']*)?"""
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(re.findall(url_pattern, text)))
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def normalize_url(url: str) -> str:
    """Return *url* normalized and guaranteed to carry a scheme (default http)."""
    parts = urlparse(url, scheme="http")
    if parts.netloc:
        return urlunparse(parts)
    # Bare host such as "example.com/x": prefix a scheme and reparse so the
    # host lands in netloc instead of path.
    return urlunparse(urlparse("http://" + url))
|
| 17 |
+
|
| 18 |
+
def resolve_short_url(url: str) -> str:
    """Resolve one redirect hop for *url* via a HEAD request.

    Returns the redirect target when the server answers with a 3xx redirect
    (resolved against the request URL when the Location header is relative),
    or the normalized URL when there is no redirect, the Location header is
    missing, or the request fails.
    """
    # Local import keeps this fix self-contained in the function.
    from urllib.parse import urljoin

    url = normalize_url(url)
    try:
        with httpx.Client(follow_redirects=False, timeout=5) as client:
            response = client.head(url, headers={"User-Agent": "Mozilla/5.0"})
            if response.status_code in {301, 302, 303, 307, 308}:
                location = response.headers.get("location")
                # Location may be relative (RFC 9110 §10.2.2); resolve it
                # against the base, and never return None when it is absent.
                return urljoin(url, location) if location else url
            return url  # No redirect
    except httpx.RequestError as e:
        print(f"Error: {e}")
        return url
|
urlscan_client.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
UrlscanClient: A simple Python client for interacting with the urlscan.io API.
|
| 3 |
+
|
| 4 |
+
This client allows you to:
|
| 5 |
+
- Submit a URL to be scanned.
|
| 6 |
+
- Retrieve scan results by UUID.
|
| 7 |
+
- Search existing scans with a query.
|
| 8 |
+
- Perform a full scan workflow with error handling.
|
| 9 |
+
|
| 10 |
+
Environment Variable:
|
| 11 |
+
URLSCAN_API_KEY (str): If not passed during initialization, the client will attempt to use this environment variable.
|
| 12 |
+
|
| 13 |
+
Dependencies:
|
| 14 |
+
- requests
|
| 15 |
+
- os
|
| 16 |
+
- time
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import os
|
| 20 |
+
import time
|
| 21 |
+
import requests
|
| 22 |
+
|
| 23 |
+
class UrlscanClient:
    """
    A client to interact with the urlscan.io API for submitting URLs, retrieving scan results,
    and searching the scan database.
    """
    BASE_URL = "https://urlscan.io/api/v1"

    def __init__(self, api_key=None):
        """
        Initialize the UrlscanClient.

        Parameters:
            api_key (str, optional): Your urlscan.io API key. If not provided, it is read from
                the URLSCAN_API_KEY environment variable.

        Raises:
            ValueError: If the API key is not provided or found in environment variables.
        """
        self.api_key = api_key or os.getenv("URLSCAN_API_KEY")
        if not self.api_key:
            raise ValueError("API key is required. Set it via parameter or the URLSCAN_API_KEY environment variable.")
        self.headers = {
            "API-Key": self.api_key,
            "Content-Type": "application/json"
        }

    def submit_url(self, url, visibility="public", tags=None, **options):
        """
        Submit a URL for scanning.

        Parameters:
            url (str): The URL to scan.
            visibility (str): Scan visibility ('public', 'unlisted', or 'private'). Defaults to 'public'.
            tags (list, optional): Optional list of tags to associate with the scan.
            **options: Additional scan options like 'useragent', 'referer', 'country', etc.
                These override the defaults below, including the 'country' hint.

        Returns:
            dict: JSON response from the submission API.

        Raises:
            requests.HTTPError: If the request fails.
        """
        payload = {
            "url": url,
            "visibility": visibility,
            "country": "fr",
            "tags": tags or []
        }
        payload.update(options)
        response = requests.post(f"{self.BASE_URL}/scan/", headers=self.headers, json=payload)
        response.raise_for_status()
        return response.json()

    def get_result(self, uuid, wait=True, timeout=60):
        """
        Retrieve the result of a scan by UUID.

        Parameters:
            uuid (str): The UUID of the scan result.
            wait (bool): Whether to wait for the scan to complete if it's not yet ready. Defaults to True.
            timeout (int): Maximum time (in seconds) to wait for the result if wait is True. Defaults to 60.

        Returns:
            dict: The scan result data.

        Raises:
            TimeoutError: If the result is not ready within the timeout period.
            requests.HTTPError: If another HTTP error occurs.
        """
        result_url = f"{self.BASE_URL}/result/{uuid}/"
        start_time = time.time()
        while True:
            response = requests.get(result_url, headers=self.headers)
            if response.status_code == 200:
                return response.json()
            elif response.status_code == 404:
                # 404 from the result endpoint means the scan is still processing.
                if not wait or (time.time() - start_time) > timeout:
                    raise TimeoutError("Scan result not available yet.")
                time.sleep(5)
            else:
                response.raise_for_status()

    def search(self, query, size=10):
        """
        Search for past scans using a query string.

        Parameters:
            query (str): The search query, such as a domain name or IP address.
            size (int): Maximum number of results to return. Defaults to 10.

        Returns:
            dict: Search results from urlscan.io.

        Raises:
            requests.HTTPError: If the request fails.
        """
        params = {"q": query, "size": size}
        response = requests.get(f"{self.BASE_URL}/search/", headers=self.headers, params=params)
        response.raise_for_status()
        return response.json()

    def scan(self, url: str):
        """
        Convenience method to submit a scan and retrieve the result.

        Parameters:
            url (str): The URL to scan.

        Returns:
            dict: The scan result, or a fallback result if the scan fails or times out.
        """
        try:
            print(f"Submit url {url}")
            submission = self.submit_url(url=url, visibility="public")
            print(f"Submitted scan. UUID: {submission['uuid']}")
            result = self.get_result(submission["uuid"])
            print(f"Submission succeed. UUID: {submission['uuid']}")
            return result
        # get_result raises TimeoutError when the scan never becomes available;
        # it must be caught here too, or the promised fallback is never returned
        # and callers (the /predict endpoint) crash on slow scans.
        except (requests.exceptions.RequestException, TimeoutError) as e:
            print(f"Submission failed {e}")
            return {
                'page': {'url': url},
                'verdicts': {'overall': {'hasVerdicts': False, 'score': 0}}
            }
|