lex数软工作室

We take on software development, data collection, image processing, and technical consulting. WeChat: lex_workshop

A Python-based static website migration scheme

A while ago a client asked us to temporarily migrate a static website. The following approach handles it:

The code walkthrough is as follows.

First, use mitmproxy to cache the response of every request, so the stored pages can be served back later.

import asyncio
from mitmproxy import options
from mitmproxy.tools import dump

from mitmproxy import http
import hashlib, pickle, os, threading, time, base64, json
from bs4 import BeautifulSoup as Soup
import requests
from urllib.parse import urljoin

mitmstore = {}

def saveFile():
    # Persist the in-memory response cache to disk
    with open("store.pickle", 'wb') as fw:
        pickle.dump(mitmstore, fw)

# Automatically load a previously saved cache on startup
if os.path.exists("store.pickle"):
    with open("store.pickle", 'rb') as fr:
        mitmstore = pickle.load(fr)

def md5(url: str, data: str):
    # Build a cache key from the URL path + method plus the request body
    return hashlib.md5(url.encode('utf-8') + data.encode('utf-8')).hexdigest()

class Addon(object):

    def request(self, flow: http.HTTPFlow):
        request_url = flow.request.url          # request URL
        request_headers = flow.request.headers  # request headers
        request_body = flow.request.get_text()  # submitted request body
        request_method = flow.request.method    # HTTP method

        id = md5(getUrlPath(request_url) + request_method, request_body)  # cache key
        if id in mitmstore:
            # Serve the cached response instead of forwarding the request upstream
            print('request', id, request_url)
            msg = mitmstore[id]
            flow.response = http.Response.make(msg['response_status_code'], msg['response_content'], msg['response_headers'])

    def response(self, flow: http.HTTPFlow):
        request_url = flow.request.url          # request URL
        request_headers = flow.request.headers  # request headers
        request_body = flow.request.get_text()  # submitted request body
        request_method = flow.request.method    # HTTP method

        response_status_code = flow.response.status_code  # status code
        response_content = flow.response.content          # response body
        response_headers = flow.response.headers          # response headers

        id = md5(getUrlPath(request_url) + request_method, request_body)  # cache key
        if id not in mitmstore:
            # Store the first response seen for this key
            mitmstore[id] = {'request_url': request_url, 'request_headers': request_headers, 'request_body': request_body, 'request_method': request_method, 'response_status_code': response_status_code, 'response_content': response_content, 'response_headers': response_headers}

async def start_proxy(port):
    print('Proxy started on port', port)
    opts = options.Options(listen_host='127.0.0.1', listen_port=port)

    master = dump.DumpMaster(
        opts,
        with_termlog=False,
        with_dumper=False,
    )
    master.addons.add(Addon())
    await master.run()
    return master
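
The script above only defines the proxy; a small driver is still needed to launch it. Below is a minimal sketch, assuming the proxy runs as its own process on port 16767 (the port hard-coded in the crawler below) and that getUrlPath from the crawler script lives in the same module:

if __name__ == '__main__':
    try:
        # Launch the caching proxy on the port the crawler points at.
        asyncio.run(start_proxy(16767))
    except KeyboardInterrupt:
        # Persist the collected responses when the proxy is stopped.
        saveFile()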

Next, a generic Python crawler collects the pages of the static site to be served.

def getUrlPath(url: str):
    # Strip the scheme and host, keeping only the path part of the URL
    return '/'.join(url.split('/')[3:])

class WebCrawlAll:
    def __init__(self, firstUrl):
        self.toCrawl = []
        self.toCrawl.append(firstUrl)
        self.dup = set()
        self.dup.add(firstUrl)
        self.domain = self.getDomain(firstUrl)

    def getDomain(self, url: str):
        # e.g. 'http://www.example.com/x' -> 'example.com'
        return '.'.join(url.split('/')[2].split('.')[1:])

    def crawl(self, url):
        print('Crawling:\t%s' % url)
        # Route every request through the local mitmproxy so its response gets cached
        aResp = requests.get(url, proxies={'http': 'http://localhost:16767', 'https': 'http://localhost:16767'})
        aDoc = Soup(aResp.text, 'html.parser')

        # <a> tags
        links = aDoc.select('a')
        for link in links:
            try:
                href = link['href']
                if href:
                    self.addUrl(url, href)
            except KeyError:
                pass

        # <img> tags
        imgs = aDoc.select('img')
        for img in imgs:
            try:
                href = img['src']
                if href:
                    self.addUrl(url, href)
            except KeyError:
                pass

        # <link> tags (stylesheets, icons, etc.)
        links = aDoc.select('link')
        for link in links:
            try:
                href = link['href']
                if href:
                    self.addUrl(url, href)
            except KeyError:
                pass

        # <script> tags
        links = aDoc.select('script')
        for link in links:
            try:
                href = link['src']
                if href:
                    self.addUrl(url, href)
            except KeyError:
                pass

        # <embed> tags
        links = aDoc.select('embed')
        for link in links:
            try:
                href = link['src']
                if href:
                    self.addUrl(url, href)
            except KeyError:
                pass

        # <area> tags (image maps)
        links = aDoc.select('area')
        for link in links:
            try:
                href = link['href']
                if href:
                    self.addUrl(url, href)
            except KeyError:
                pass

    def addUrl(self, url, href: str):
        if len(href) > 0:
            # Absolute links: only follow those on the same domain
            if href.startswith("http") and href not in self.dup:
                urlDomain = self.getDomain(href)
                if urlDomain == self.domain:
                    self.toCrawl.append(href)
                    self.dup.add(href)
            # Relative links: resolve against the current page, skip javascript: pseudo-links
            if not href.startswith("http") and 'javascript' not in href:
                hrefUrl = urljoin(url, href)
                if hrefUrl not in self.dup:
                    urlDomain = self.getDomain(hrefUrl)
                    if urlDomain == self.domain:
                        self.toCrawl.append(hrefUrl)
                        self.dup.add(hrefUrl)

    def crawlAll(self):
        while len(self.toCrawl) > 0:
            url = self.toCrawl.pop()
            self.crawl(url)
            saveFile()  # checkpoint the cache after each page
        print('Crawl finished')
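
To populate the cache, run the crawler against the site's entry page while the proxy above is listening. A minimal sketch, using http://www.example.com/ as a hypothetical stand-in for the client's actual site:

if __name__ == '__main__':
    # The proxy from the first script must already be listening on port 16767.
    crawler = WebCrawlAll('http://www.example.com/')  # hypothetical entry URL
    crawler.crawlAll()

After the crawl finishes, store.pickle holds every same-domain response. Restarting the proxy then answers matching requests straight from mitmstore in the request hook, so clients pointed at the proxy keep seeing the site even when the origin server is offline.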
