Advanced Usage¶
This guide covers more advanced features and techniques for using liburlparser in Python.
Working with the Public Suffix List (PSL)¶
liburlparser uses the Public Suffix List to accurately parse domain suffixes. You can interact with the PSL through the global psl
object:
from liburlparser import psl
# Check if PSL is loaded
print(psl.is_loaded()) # True (by default)
# Get PSL information
print(psl.url) # URL of the public suffix list
print(psl.filename) # Filename of the public suffix list
# Load PSL from a custom path
psl.load_from_path("/path/to/custom/public_suffix_list.dat")
# Load PSL from a string
with open("/path/to/custom/public_suffix_list.dat", "r") as f:
    psl_content = f.read()
psl.load_from_string(psl_content)
Ignoring "www" Subdomain¶
You can choose to ignore the "www" subdomain when parsing:
from liburlparser import Host, Url

# Default behavior
host = Host("www.example.com")
print(host.subdomain) # www
print(host.domain) # example
# Ignore www
host = Host("www.example.com", ignore_www=True)
print(host.subdomain) # (empty string)
print(host.domain) # example
# Same for URLs
url = Url("https://www.example.com/about", ignore_www=True)
print(url.subdomain) # (empty string)
print(url.domain) # example
Removing "www" from a Host String¶
If you just need to remove "www." from a hostname:
from liburlparser import Host
host_str = Host.removeWWW("www.example.com")
print(host_str) # example.com
Handling Complex Domains¶
liburlparser can handle complex domain structures, including multi-part suffixes:
# Standard TLDs
host = Host("example.com")
print(host.domain) # example
print(host.suffix) # com
# Country-specific TLDs
host = Host("example.co.uk")
print(host.domain) # example
print(host.suffix) # co.uk
# Multi-level subdomains
host = Host("a.b.c.example.com")
print(host.subdomain) # a.b.c
print(host.domain) # example
print(host.suffix) # com
Compatibility with Other Libraries¶
If you're migrating from pydomainextractor, liburlparser provides compatible methods:
# pydomainextractor style:
import pydomainextractor
extractor = pydomainextractor.DomainExtractor()
result = extractor.extract("mail.google.com")
result = extractor.extract_from_url("https://mail.google.com/about")
# liburlparser equivalent:
from liburlparser import Host
result = Host.extract("mail.google.com")
result = Host.extract_from_url("https://mail.google.com/about")
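Based on how these results are used later in this guide, both calls return a dict of components, so existing code that indexes into pydomainextractor's result should keep working. A minimal sketch, assuming the 'subdomain'/'domain'/'suffix' keys shown in the batch-processing example below:
from liburlparser import Host

result = Host.extract_from_url("https://mail.google.com/about")
# Assumed dict shape, matching the keys used in the batch-processing example below
print(result['subdomain'])  # mail
print(result['domain'])     # google
print(result['suffix'])     # com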
Error Handling¶
liburlparser is designed to be robust and handle various edge cases:
# Empty or invalid URLs
try:
    url = Url("")
except Exception as e:
    print(f"Error: {e}")
# URLs without protocols
url = Url("example.com")
print(url.protocol) # (empty string)
print(url.domain) # example
# URLs with unusual formats
url = Url("https://user:pass@example.com:8080/path?query=value#fragment")
print(url.userinfo) # user:pass
print(url.port) # 8080
print(url.query) # query=value
Performance Optimization¶
For high-performance applications, consider these techniques:
- Use Host.from_url() instead of creating a Url object when you only need host information:

  # Slower (creates a full Url object)
  url = Url("https://example.com/path")
  host = url.host

  # Faster (directly extracts the host)
  host = Host.from_url("https://example.com/path")

- Use Url.extract_host() when you only need the host string:

  # Fastest (just extracts the host string without parsing the rest of the URL)
  host_str = Url.extract_host("https://example.com/path")

- Use Host.extract() or Host.extract_from_url() when you only need domain components:

  # Fast (returns just the components you need)
  components = Host.extract("example.com")
  components = Host.extract_from_url("https://example.com/path")
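To check which of these actually pays off for your workload, here is a rough timing sketch. It is illustrative only; absolute numbers depend on your machine and URL mix, and it only exercises the API calls named above:
import time
from liburlparser import Url, Host

urls = ["https://www.example.com/some/path?x=1"] * 100_000

def timed(label, fn):
    # Time one pass over the URL list with the given parsing function
    start = time.time()
    for u in urls:
        fn(u)
    print(f"{label}: {time.time() - start:.3f} s")

timed("Url(...).host", lambda u: Url(u).host)
timed("Host.from_url(...)", lambda u: Host.from_url(u))
timed("Url.extract_host(...)", lambda u: Url.extract_host(u))
timed("Host.extract_from_url(...)", lambda u: Host.extract_from_url(u))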
Working with International Domain Names (IDNs)¶
liburlparser has built-in support for International Domain Names (IDNs):
# Parse IDNs directly
host = Host("例子.测试")
print(host.domain) # 例子
print(host.suffix) # 测试
# IDNs in URLs
url = Url("https://例子.测试/path")
print(url.domain) # 例子
print(url.suffix) # 测试
# Mixed IDN and ASCII
host = Host("subdomain.例子.测试")
print(host.subdomain) # subdomain
print(host.domain) # 例子
print(host.suffix) # 测试
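If your input arrives in punycode (ASCII) form instead, one workaround is to decode it back to Unicode with Python's built-in idna codec before parsing; whether liburlparser accepts punycode labels directly is not covered here, so treat this as a sketch:
from liburlparser import Host

# Punycode form of 例子.测试, decoded with the standard-library idna codec
ascii_host = "xn--fsqu00a.xn--0zwm56d"
unicode_host = ascii_host.encode("ascii").decode("idna")

host = Host(unicode_host)
print(host.domain)  # 例子
print(host.suffix)  # 测试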
Batch Processing¶
For processing large numbers of URLs efficiently:
from liburlparser import Host
import csv
def extract_domains_from_csv(csv_file, url_column, output_file):
    with open(csv_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8', newline='') as outfile:
        reader = csv.DictReader(infile)
        fieldnames = list(reader.fieldnames) + ['domain', 'suffix', 'subdomain']
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in reader:
            url = row[url_column]
            try:
                # Use the fast extract_from_url method
                domain_info = Host.extract_from_url(url)
                row['domain'] = domain_info['domain']
                row['suffix'] = domain_info['suffix']
                row['subdomain'] = domain_info['subdomain']
            except Exception:
                row['domain'] = ''
                row['suffix'] = ''
                row['subdomain'] = ''
            writer.writerow(row)
# Example usage
# extract_domains_from_csv('urls.csv', 'url_column', 'output.csv')
Integration with Other Libraries¶
Using with requests¶
import requests
from liburlparser import Url, Host
def get_domain_info(url_str):
    # Parse the URL first to validate it
    url = Url(url_str)
    # Make the request
    response = requests.get(url_str)
    # Check if there were redirects
    if response.history:
        final_url = response.url
        final_domain = Host.from_url(final_url).domain
        print(f"Redirected from {url.domain} to {final_domain}")
    return {
        'original_domain': url.domain,
        'final_domain': Host.from_url(response.url).domain,
        'status_code': response.status_code
    }
# Example usage
# info = get_domain_info("http://github.com")
Using with pandas¶
import pandas as pd
from liburlparser import Host
def extract_domains(df, url_column):
    # Create a function to extract domain info
    def get_domain_info(url):
        try:
            info = Host.extract_from_url(url)
            return pd.Series([
                info['domain'],
                info['suffix'],
                info['subdomain']
            ])
        except Exception:
            return pd.Series(['', '', ''])

    # Apply the function to the URL column
    domain_info = df[url_column].apply(get_domain_info)
    domain_info.columns = ['domain', 'suffix', 'subdomain']
    # Join the results back to the original dataframe
    return pd.concat([df, domain_info], axis=1)
# Example usage
# df = pd.read_csv('urls.csv')
# df_with_domains = extract_domains(df, 'url_column')
Handling Edge Cases¶
liburlparser is designed to handle various edge cases:
# IP addresses as hosts
url = Url("https://192.168.1.1/path")
print(url.host) # <Host :'192.168.1.1'>
print(url.domain) # (empty string, as IPs don't have domains)
# Localhost
url = Url("http://localhost:8080")
print(url.host) # <Host :'localhost'>
print(url.domain) # localhost
print(url.suffix) # (empty string)
# URLs with unusual ports
url = Url("https://example.com:65535/path")
print(url.port) # 65535
# URLs with empty paths
url = Url("https://example.com")
print(url.path) # (empty string)
# URLs with query parameters but no path
url = Url("https://example.com?param=value")
print(url.query) # param=value
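These empty-string behaviors make it easy to tell registrable domains apart from IPs and bare hostnames. A small helper built on exactly the cases shown above (the helper itself is hypothetical, not part of liburlparser):
from liburlparser import Url

def describe_host(url_str):
    # Hypothetical helper based on the behaviors shown above
    url = Url(url_str)
    if not url.domain:
        return "no registrable domain (e.g. an IP address)"
    if not url.suffix:
        return "hostname without a public suffix (e.g. localhost)"
    return f"registered domain: {url.domain}.{url.suffix}"

print(describe_host("https://192.168.1.1/path"))  # no registrable domain (e.g. an IP address)
print(describe_host("http://localhost:8080"))     # hostname without a public suffix (e.g. localhost)
print(describe_host("https://example.com"))       # registered domain: example.com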
Custom PSL Rules¶
You can create and use a custom Public Suffix List with additional rules:
from liburlparser import Host, psl
# Create a custom PSL with additional rules
custom_psl = """
// Standard PSL rules
com
org
net
// Custom rules for internal domains
internal
local
dev
test
"""
# Load the custom PSL
psl.load_from_string(custom_psl)
# Now parse domains with custom suffixes
host = Host("example.internal")
print(host.domain) # example
print(host.suffix) # internal
host = Host("server.dev")
print(host.domain) # server
print(host.suffix) # dev
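When you are done with a custom list, you may want to switch back to the bundled one. One option, assuming psl.filename (shown earlier) points at the bundled file on disk, is simply to reload it:
# Reload the bundled Public Suffix List (assumes psl.filename is a loadable path)
psl.load_from_path(psl.filename)
print(psl.is_loaded())  # True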
Thread Safety¶
liburlparser is thread-safe for parsing operations, but PSL loading should be done before spawning threads:
import threading
from liburlparser import Host, psl

# Make sure PSL is loaded before creating threads
if not psl.is_loaded():
    psl.load_from_path("/path/to/public_suffix_list.dat")

def parse_hosts(hosts, results, thread_id):
    for host_str in hosts:
        try:
            host = Host(host_str)
            results.append({
                'thread': thread_id,
                'host': host_str,
                'domain': host.domain,
                'suffix': host.suffix
            })
        except Exception as e:
            results.append({
                'thread': thread_id,
                'host': host_str,
                'error': str(e)
            })

def multi_threaded_parsing(all_hosts, num_threads=4):
    threads = []
    results = []
    # Split hosts among threads
    chunk_size = len(all_hosts) // num_threads
    for i in range(num_threads):
        start = i * chunk_size
        end = start + chunk_size if i < num_threads - 1 else len(all_hosts)
        hosts_chunk = all_hosts[start:end]
        thread = threading.Thread(
            target=parse_hosts,
            args=(hosts_chunk, results, i)
        )
        threads.append(thread)
        thread.start()
    # Wait for all threads to complete
    for thread in threads:
        thread.join()
    return results
# Example usage
# hosts = ["example.com", "google.com", "github.com", ...] * 1000
# results = multi_threaded_parsing(hosts, num_threads=8)
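The same pattern is often easier to express with the standard-library executor API. Under the same assumption that parsing is thread-safe once the PSL is loaded, a sketch with concurrent.futures:
from concurrent.futures import ThreadPoolExecutor
from liburlparser import Host

def parse_one(host_str):
    host = Host(host_str)
    return {'host': host_str, 'domain': host.domain, 'suffix': host.suffix}

def parse_all(hosts, max_workers=8):
    # ThreadPoolExecutor handles the chunking and joining for us
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(parse_one, hosts))

# Example usage
# results = parse_all(["example.com", "mail.google.com", "example.co.uk"])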
Memory Management¶
For processing very large datasets with minimal memory usage:
from liburlparser import Host
import csv
def process_urls_streaming(input_file, output_file, batch_size=10000):
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8', newline='') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        writer.writerow(['url', 'domain', 'suffix', 'subdomain'])
        batch = []
        for i, row in enumerate(reader):
            if i == 0:  # Skip header
                continue
            url = row[0]
            batch.append(url)
            # Process in batches to avoid memory issues
            if len(batch) >= batch_size:
                process_batch(batch, writer)
                batch = []
        # Process remaining URLs
        if batch:
            process_batch(batch, writer)

def process_batch(urls, writer):
    for url in urls:
        try:
            info = Host.extract_from_url(url)
            writer.writerow([
                url,
                info['domain'],
                info['suffix'],
                info['subdomain']
            ])
        except Exception:
            writer.writerow([url, '', '', ''])
# Example usage
# process_urls_streaming('millions_of_urls.csv', 'domains.csv', batch_size=50000)
Advanced URL Parsing¶
For more complex URL parsing needs:
from liburlparser import Url
import urllib.parse
def parse_query_params(url_str):
    url = Url(url_str)
    query = url.query
    # Parse query parameters
    params = urllib.parse.parse_qs(query)
    return {
        'url': url_str,
        'domain': url.domain,
        'path': url.path,
        'params': params
    }
# Example
result = parse_query_params("https://example.com/search?q=test&page=1&filter=active")
print(result)
# {
# 'url': 'https://example.com/search?q=test&page=1&filter=active',
# 'domain': 'example',
# 'path': '/search',
# 'params': {'q': ['test'], 'page': ['1'], 'filter': ['active']}
# }
Benchmarking¶
You can benchmark liburlparser against other libraries:
import time
from liburlparser import Host
# Try to import other libraries
try:
    import tldextract
    has_tldextract = True
except ImportError:
    has_tldextract = False

try:
    import publicsuffix2
    has_publicsuffix2 = True
except ImportError:
    has_publicsuffix2 = False

def benchmark(urls, iterations=3):
    results = {}

    # Benchmark liburlparser
    total_time = 0
    for _ in range(iterations):
        start = time.time()
        for url in urls:
            Host.extract_from_url(url)
        total_time += time.time() - start
    results['liburlparser'] = total_time / iterations

    # Benchmark tldextract if available
    if has_tldextract:
        total_time = 0
        for _ in range(iterations):
            start = time.time()
            for url in urls:
                tldextract.extract(url)
            total_time += time.time() - start
        results['tldextract'] = total_time / iterations

    # Benchmark publicsuffix2 if available
    if has_publicsuffix2:
        total_time = 0
        for _ in range(iterations):
            start = time.time()
            for url in urls:
                try:
                    publicsuffix2.get_sld(url)
                except Exception:
                    pass
            total_time += time.time() - start
        results['publicsuffix2'] = total_time / iterations

    return results
# Example usage
# urls = ["https://example.com", "https://mail.google.com", ...] * 1000
# results = benchmark(urls)
# print(results)
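As a concrete way to run it, you could generate a small synthetic workload (purely illustrative URLs) and print the per-pass timings from the benchmark() function defined above:
# Purely illustrative: synthetic URLs just to exercise the benchmark() function above
sample_urls = [f"https://sub{i}.example{i % 100}.co.uk/path?x={i}" for i in range(10_000)]

timings = benchmark(sample_urls, iterations=3)
for name, seconds in sorted(timings.items(), key=lambda kv: kv[1]):
    print(f"{name}: {seconds:.3f} s per pass")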
Conclusion¶
This advanced guide covers the more complex aspects of using liburlparser in Python. By leveraging these techniques, you can build high-performance applications that efficiently process and analyze URLs and domains.
For more information, check out the API Reference or the Examples page.