:::info
💘 Full penetration-testing workflow:
Information gathering – vulnerability discovery – vulnerability exploitation – privilege escalation – tunneling – internal network penetration – lateral movement – post-exploitation
:::
Link Extractor
Reference: BeautifulSoup's `find_all`
- Parse the page
- Extract the `href` attribute
- Join relative links into full URLs (see the short `urljoin`/`urlparse` sketch below)
- Classify each link as internal or external
- Save the results
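Before the full script, here is a minimal sketch of the URL-joining and query-stripping steps, using the standard-library `urljoin` and `urlparse` (the `example.com` URLs are placeholders, not taken from the scripts below):

```python
from urllib.parse import urljoin, urlparse

# Resolve a relative href against the page it was found on
full = urljoin('http://example.com/docs/index.html', 'page2.html?id=7')
print(full)  # http://example.com/docs/page2.html?id=7

# Keep only scheme://netloc/path, dropping the GET parameters
p = urlparse(full)
print(f'{p.scheme}://{p.netloc}{p.path}')  # http://example.com/docs/page2.html
```

This is exactly the normalization the crawler applies to every `<a href>` it finds before deciding whether the link is internal or external.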
V1.0
```python
#!/usr/bin/env python
"""Link extractor."""
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin, urlparse

# Internal links
internal_urls = set()
# External links
external_urls = set()


def is_valid(url):
    """Check whether the URL is well formed."""
    parsed = urlparse(url)
    loc = bool(parsed.netloc)   # domain
    sche = bool(parsed.scheme)  # scheme
    return loc and sche


def get_allwebsite_links(url):
    """Collect all links found on the given page."""
    urls = set()
    domain_name = urlparse(url).netloc
    soup = bs(requests.get(url).content, 'lxml')
    for a_tag in soup.find_all('a'):
        href = a_tag.attrs.get('href')
        if not href:
            continue
        href = urljoin(url, href)
        # Strip the GET parameters
        parsed_href = urlparse(href)
        href = f'{parsed_href.scheme}://{parsed_href.netloc}{parsed_href.path}'
        if not is_valid(href):
            continue
        if href in internal_urls:
            continue
        # External link
        if domain_name not in href:
            if href not in external_urls:
                print(f'[*] url is an external_url : {href}')
                external_urls.add(href)
            continue
        # Internal link
        print(f'[*] url is an internal_url : {href}')
        internal_urls.add(href)
        urls.add(href)
    return urls


# Recursive crawl
total_urls_visited = 0


def crawl(url, max_url=3):
    global total_urls_visited
    total_urls_visited += 1
    print(f'visiting {url}')
    links = get_allwebsite_links(url)
    for link in links:
        if total_urls_visited > max_url:
            break
        crawl(link, max_url=max_url)


if __name__ == '__main__':
    url = 'http://www.mashibing.com'
    crawl(url, 15)
    print('[+] find completed')
    print('[+] find internal_urls: %s' % len(internal_urls))
    print('[+] find external_urls: %s' % len(external_urls))
```
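One weakness shared by both versions: `requests.get` is called with no timeout and no error handling, so a single slow or unreachable page stops the whole crawl with an exception. A minimal, defensive fetch helper might look like the sketch below (the name `fetch_soup` and the 10-second timeout are my assumptions, not part of the original script):

```python
import requests
from bs4 import BeautifulSoup as bs


def fetch_soup(url, timeout=10):
    """Hypothetical helper: fetch a page and return its parsed soup, or None on failure."""
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
        return bs(resp.content, 'lxml')
    except requests.RequestException as e:
        print(f'[-] failed to fetch {url}: {e}')
        return None
```

`get_allwebsite_links` could then return an empty set whenever `fetch_soup` returns `None`, and the crawl would simply skip the broken page instead of crashing.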
V2.0
Improvements:
- Write the internal and external links to separate output files
```python
#!/usr/bin/env python
"""Link extractor."""
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin, urlparse

# Internal links
internal_urls = set()
# External links
external_urls = set()


def is_valid(url):
    """Check whether the URL is well formed."""
    parsed = urlparse(url)
    loc = bool(parsed.netloc)   # domain
    sche = bool(parsed.scheme)  # scheme
    return loc and sche


def get_allwebsite_links(url):
    """Collect all links found on the given page."""
    urls = set()
    domain_name = urlparse(url).netloc
    soup = bs(requests.get(url).content, 'lxml')
    for a_tag in soup.find_all('a'):
        href = a_tag.attrs.get('href')
        if not href:
            continue
        href = urljoin(url, href)
        # Strip the GET parameters
        parsed_href = urlparse(href)
        href = f'{parsed_href.scheme}://{parsed_href.netloc}{parsed_href.path}'
        if not is_valid(href):
            continue
        if href in internal_urls:
            continue
        # External link
        if domain_name not in href:
            if href not in external_urls:
                # print(f'[*] url is an external_url : {href}')
                external_urls.add(href)
            continue
        # Internal link
        # print(f'[*] url is an internal_url : {href}')
        internal_urls.add(href)
        urls.add(href)
    return urls


# Recursive crawl
total_urls_visited = 0


def crawl(url, max_url=3):
    global total_urls_visited
    total_urls_visited += 1
    print(f'visiting {url}')
    links = get_allwebsite_links(url)
    for link in links:
        if total_urls_visited > max_url:
            break
        crawl(link, max_url=max_url)


if __name__ == '__main__':
    url = 'http://www.baidu.com'
    crawl(url, 15)
    domain_name = urlparse(url).netloc
    print('[+] find completed')
    print('[+] find internal_urls: %s' % len(internal_urls))
    print('[+] find external_urls: %s' % len(external_urls))
    with open(f'{domain_name}_int_links', 'w') as f:
        for x in internal_urls:
            print(x.strip(), file=f)
    with open(f'{domain_name}_ext_links', 'w') as f:
        for x in external_urls:
            print(x.strip(), file=f)
```
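A further improvement worth considering (my suggestion, not implemented in either version): the internal/external decision relies on the substring test `domain_name not in href`, so an external URL whose path happens to contain the target domain, e.g. `http://archive.org/web/www.baidu.com/index.html`, would be counted as internal. Comparing parsed host names is stricter:

```python
from urllib.parse import urlparse


def is_internal(href, domain_name):
    """Hypothetical stricter check: compare host names instead of raw substrings."""
    host = urlparse(href).netloc
    return host == domain_name or host.endswith('.' + domain_name)
```

In `get_allwebsite_links`, the condition `if domain_name not in href:` would then become `if not is_internal(href, domain_name):`.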