Exploitation – Link Extractor

:::info
Full penetration-testing workflow:
Information gathering – Vulnerability discovery – Vulnerability exploitation – Privilege escalation – Tunneling – Intranet penetration – Lateral movement – Post-exploitation
:::

Link Extractor

The extractor is built around BeautifulSoup's find_all and proceeds in five steps (a minimal sketch of steps 3 and 4 follows the list):

  1. Parse the web page
  2. Extract the href attribute from each <a> tag
  3. Join relative paths into complete URLs
  4. Decide whether each link is internal or external
  5. Save the results
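
A minimal sketch of steps 3 and 4 using urljoin and urlparse (the URLs below are placeholders chosen for illustration, not from the original post):

from urllib.parse import urljoin, urlparse

page_url = 'http://www.example.com/blog/index.html'   # assumed page being parsed
href = '../about.html'                                 # assumed raw href from an <a> tag

full_url = urljoin(page_url, href)                     # step 3: build the complete URL
# step 4: internal if the link's host matches the page's host
is_internal = urlparse(full_url).netloc == urlparse(page_url).netloc
print(full_url, 'internal' if is_internal else 'external')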

V1.0

#!/usr/bin/env python3

'''
Link extractor
'''

import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin, urlparse

# internal links
internal_urls = set()
# external links
external_urls = set()

def is_valid(url):
    '''Check whether a URL has both a scheme and a network location.'''
    parsed = urlparse(url)
    loc = bool(parsed.netloc)    # domain
    sche = bool(parsed.scheme)   # protocol
    return loc and sche

def get_allwebsite_links(url):
    '''Collect all links found on a single page.'''
    urls = set()
    domain_name = urlparse(url).netloc
    soup = bs(requests.get(url).content, 'lxml')

    for a_tag in soup.find_all('a'):
        href = a_tag.attrs.get('href')
        if href == '' or href is None:
            continue
        href = urljoin(url, href)
        # drop the query string from the URL
        parsed_href = urlparse(href)
        href = f'{parsed_href.scheme}://{parsed_href.netloc}{parsed_href.path}'
        if not is_valid(href):
            continue
        if href in internal_urls:
            continue
        # external link: the page's domain does not appear in the URL
        if domain_name not in href:
            if href not in external_urls:
                print(f'[*] url is an external_url : {href}')
                external_urls.add(href)
            continue
        # internal link
        print(f'[*] url is an internal_url : {href}')
        internal_urls.add(href)
        urls.add(href)
    return urls

# recursive crawl, bounded by the total number of pages visited
total_urls_visited = 0
def crawl(url, max_url=3):
    global total_urls_visited
    total_urls_visited += 1
    print(f'visiting {url}')
    links = get_allwebsite_links(url)
    for link in links:
        if total_urls_visited > max_url:
            break
        crawl(link, max_url=max_url)

if __name__ == '__main__':
    url = 'http://www.mashibing.com'
    if is_valid(url):
        crawl(url, 15)
    print('[+] find completed')
    print('[+] find internal_urls: %s' % len(internal_urls))
    print('[+] find external_urls: %s' % len(external_urls))
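
Note that the internal/external test above is a plain substring check (domain_name not in href), so an external URL that happens to contain the domain string in its host or path would be counted as internal. A stricter variant compares the parsed host instead; the classify_link helper below is an illustrative sketch, not part of the original script:

from urllib.parse import urlparse

def classify_link(href, domain_name):
    # illustrative helper: compare host names instead of substring-matching the whole URL
    return 'internal' if urlparse(href).netloc == domain_name else 'external'

# classify_link('http://evil.example/www.mashibing.com.html', 'www.mashibing.com')
# -> 'external', whereas the substring check above would treat it as internal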

V2.0

Improvements:

  • Write external and internal links to separate files for later use
#!/usr/bin/env python3

'''
Link extractor
'''

import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin, urlparse

# internal links
internal_urls = set()
# external links
external_urls = set()

def is_valid(url):
    '''Check whether a URL has both a scheme and a network location.'''
    parsed = urlparse(url)
    loc = bool(parsed.netloc)    # domain
    sche = bool(parsed.scheme)   # protocol
    return loc and sche

def get_allwebsite_links(url):
    '''Collect all links found on a single page.'''
    urls = set()
    domain_name = urlparse(url).netloc
    soup = bs(requests.get(url).content, 'lxml')

    for a_tag in soup.find_all('a'):
        href = a_tag.attrs.get('href')
        if href == '' or href is None:
            continue
        href = urljoin(url, href)
        # drop the query string from the URL
        parsed_href = urlparse(href)
        href = f'{parsed_href.scheme}://{parsed_href.netloc}{parsed_href.path}'
        if not is_valid(href):
            continue
        if href in internal_urls:
            continue
        # external link: the page's domain does not appear in the URL
        if domain_name not in href:
            if href not in external_urls:
                # print(f'[*] url is an external_url : {href}')
                external_urls.add(href)
            continue
        # internal link
        # print(f'[*] url is an internal_url : {href}')
        internal_urls.add(href)
        urls.add(href)
    return urls

# recursive crawl, bounded by the total number of pages visited
total_urls_visited = 0
def crawl(url, max_url=3):
    global total_urls_visited
    total_urls_visited += 1
    print(f'visiting {url}')
    links = get_allwebsite_links(url)
    for link in links:
        if total_urls_visited > max_url:
            break
        crawl(link, max_url=max_url)

if __name__ == '__main__':
    url = 'http://www.baidu.com'
    if is_valid(url):
        crawl(url, 15)
    domain_name = urlparse(url).netloc
    print('[+] find completed')
    print('[+] find internal_urls: %s' % len(internal_urls))
    print('[+] find external_urls: %s' % len(external_urls))

    with open(f'{domain_name}_int_links', 'w') as f:
        for x in internal_urls:
            print(x.strip(), file=f)
    with open(f'{domain_name}_ext_links', 'w') as f:
        for x in external_urls:
            print(x.strip(), file=f)
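
Neither version sets a timeout or handles request failures, so a single unreachable page will raise an exception and stop the whole crawl. The fetch_soup helper below is a hedged sketch of a more defensive fetch that could replace the requests.get call inside get_allwebsite_links; the helper name and the 10-second timeout are assumptions, not from the original script:

import requests
from bs4 import BeautifulSoup as bs

def fetch_soup(url, timeout=10):
    # illustrative helper, not in the original script: fetch a page defensively
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
        return bs(resp.content, 'lxml')
    except requests.RequestException as e:
        print(f'[!] failed to fetch {url}: {e}')
        return bs('', 'lxml')   # empty soup, so no links are extracted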