Python解决jm爬取图片分层倒置3.0版本解决方案
上一篇文章给出的还原方案,在我于“书海”里畅游(看🐍🐍)的时候暴露出了一些错误:有些区分度很低的图片恢复之后会是乱上加乱,很明显,图片的水印并不在底端且色彩不连续。2.0版本以jm水印为基准来判断还原结果,这样便能正确地还原jm的图片(但是我没法子大量实验,爬jm太多导致ip被封了),速度大概在280ms一张图左右。3.0版本再采用CV2库里的匹配算法,通过获取水印位置来获得分页页数,极大减少了无用的循环,一张图还原时间大概为150~250ms(v1.0版本的程序还原时间约为3秒)。
(蛇年说python,python搞爬虫,爬虫爬🐍🐍。祝大家除夕快乐)
关于jm的图像分层倒置问题,我上一篇文章给出了一个看似正确的结果。但是我在“书海”里畅游(看🐍🐍)的时候发现了一些错误:有些区分度很低的图片恢复之后会是乱上加乱。以下给张例图

很明显,图片的水印并不在底端且色彩不连续,明显能判断这张图片还原得有问题。
于是我选择更改方式,选择以jm水印为基准,通过ssim计算切片相似度。
之所以选择ssim算法,原因是其可以计算结构相似度。
而程序的逻辑便是:
按照几种情况穷举还原结果。
读取还原结果,截取左下角与右下角各200*50px的区域
对左、右下角和标准jm水印进行canny边缘识别提取特征
比对提取到的边缘
相似度最高的便是正确结果
以下便是按照上述逻辑编写的代码
#jm图片还原Version2.0
from skimage.metrics import structural_similarity as ssim
import cv2
from PIL import Image
import math
import os
import sys
import numpy as np
import shutil
#源文件计数
def CountFiles(directory):
    """Return the total number of files found under ``directory``.

    The tree is traversed with ``os.walk``, so files in nested
    sub-directories are included; directories themselves are not counted.
    A path that ``os.walk`` cannot traverse simply yields a count of 0.
    """
    return sum(len(names) for _, _, names in os.walk(directory))
#取最大值序号
def GetMaxIndex(arr):
    """Return the position of the largest element of ``arr``.

    If the maximum occurs several times, the first occurrence is reported.
    An empty ``arr`` raises ``ValueError`` (propagated from ``max``).
    """
    peak = max(arr)
    for position, value in enumerate(arr):
        # Identity first, then equality: the same test ``list.index`` uses.
        if value is peak or value == peak:
            return position
#输出最大值
def GetBig(a, b):
    """Return the larger of ``a`` and ``b``.

    When the two values are equal, ``a`` is returned. If exactly one of
    them is NaN, the other one is returned, so a single undefined score
    cannot hide a valid one. The function never returns ``None``: the
    previous three-way comparison fell through without a result whenever
    an operand was NaN, because every comparison with NaN is false.
    """
    if a != a:  # only NaN is unequal to itself
        return b
    if b != b:
        return a
    return b if b > a else a
#获取对应分区数的分割位置
def GetPartHeight(Height, PartNum):
    """Return the row boundaries that cut an image into ``PartNum`` strips.

    The result starts at 0 and ends at ``Height``. Every strip is
    ``Height // PartNum`` rows tall except the last one, which also
    absorbs the remainder rows. Strip ``i`` spans ``result[i]`` to
    ``result[i + 1]``.

    Args:
        Height: Total image height in pixels (integer).
        PartNum: Number of strips.

    Returns:
        A list of ``PartNum + 1`` ascending row offsets.

    Raises:
        ZeroDivisionError: If ``PartNum`` is 0.
    """
    # Integer division stays exact for any int; float division followed
    # by floor() can lose precision on very large values.
    AvePartHeight = Height // PartNum
    return [0] + [AvePartHeight * i for i in range(1, PartNum)] + [Height]
#获取还原时的粘贴位置
def GetReCanvasPartHeight(Height, PartNum):
    """Return the paste offsets used when rebuilding the image.

    The first pasted strip is the tall one (average strip height plus the
    remainder rows), so the offsets are 0, then that tall height, then
    steps of the average strip height after it.

    Args:
        Height: Total image height in pixels.
        PartNum: Number of strips.

    Returns:
        A list starting with 0 followed by the cumulative paste offsets.
    """
    base = math.floor(Height / PartNum)
    tall = base + Height % PartNum
    offsets = [0, tall]
    offsets.extend(tall + base * step for step in range(1, PartNum))
    return offsets
#复原照片
def ReCanvas(PartNum, InImage):
    """Rebuild an image by reversing the order of its horizontal strips.

    ``InImage`` is cut into ``PartNum`` full-width strips at the rows
    given by ``GetPartHeight``. The strips are then pasted bottom-first
    into a new RGB canvas of the same size, at the rows given by
    ``GetReCanvasPartHeight``.

    Args:
        PartNum: Number of strips the image is cut into.
        InImage: Source PIL image.

    Returns:
        A new ``'RGB'`` PIL image with the strip order reversed.
    """
    Width, Height = InImage.width, InImage.height
    cut_rows = GetPartHeight(Height, PartNum)
    paste_rows = GetReCanvasPartHeight(Height, PartNum)
    strips = [
        InImage.crop((0, cut_rows[k], Width, cut_rows[k + 1]))
        for k in range(PartNum)
    ]
    OutImage = Image.new('RGB', (Width, Height))
    # zip stops after the PartNum strips; the trailing paste offset is unused.
    for row, strip in zip(paste_rows, reversed(strips)):
        OutImage.paste(strip, (0, row))
    return OutImage
#边缘检测
def PicCanny(SouPic):
    """Return the Canny edge map of ``SouPic``.

    The input is first smoothed with a 5x5 Gaussian blur, then passed to
    ``cv2.Canny`` with thresholds 50 and 150.
    """
    return cv2.Canny(cv2.GaussianBlur(SouPic, (5, 5), 0), 50, 150)
#PIL转CV2
def PilToCV2(SouPic):
    """Convert a PIL image into a BGR array for OpenCV.

    Args:
        SouPic: A PIL image in any mode.

    Returns:
        The pixel data as a NumPy array with channels in BGR order.
    """
    # Normalise to RGB first: palette ("P") or grayscale ("L") images
    # would otherwise produce a 2-D array, which RGB2BGR cannot convert.
    rgb = np.asarray(SouPic.convert('RGB'))
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
#截取左、右下角
def Pic90(pic):
    """Crop the bottom-left and bottom-right corners of ``pic``.

    Each crop box is 200 pixels wide and 50 pixels tall and touches the
    bottom edge of the image.

    Returns:
        A ``(left_corner, right_corner)`` tuple of cropped images.
    """
    bottom = pic.height
    right_edge = pic.width
    top = bottom - 50
    left_corner = pic.crop((0, top, 200, bottom))
    right_corner = pic.crop((right_edge - 200, top, right_edge, bottom))
    return left_corner, right_corner
# Driver: restore every page in PicR by trying each slice count and keeping
# the candidate whose bottom corner looks most like the reference watermark.
KeyStr1 = './comicname/'
PartNum = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]  # every slice count to try
PicSave = "./{}".format(KeyStr1)  # output directory
PicR = "./source/"  # directory holding the scrambled source pages
# exist_ok keeps a second run from crashing on the existing directory.
os.makedirs(PicSave, exist_ok=True)
biaozunshuiyin = "./exp.png"  # reference watermark image

# The reference watermark is the same for every page, so its edge map is
# computed once here instead of once per page.
with Image.open(biaozunshuiyin) as ayt:
    aruy = PicCanny(PilToCV2(ayt))

aaaaa = CountFiles(PicR)
for ccc in range(1, aaaaa + 1):
    print("---------")
    # Pages are expected to be named 00001.webp, 00002.webp, ... inside PicR.
    # Building the path from PicR keeps the directory name consistent (the
    # old hard-coded "./Source/" broke on case-sensitive file systems).
    Picrd = os.path.join(PicR, "{:0>5d}.webp".format(ccc))
    dfdf = os.path.join(PicSave, "{:0>5d}.png".format(ccc))

    # Build one restored candidate per slice count; the source file is
    # closed as soon as all candidates exist.
    with Image.open(Picrd) as Spicd:
        PicCon = [ReCanvas(i, Spicd) for i in PartNum]

    # Score each candidate by the better of its two bottom corners.
    valcon = []
    for candidate in PicCon:
        picz, picy = Pic90(candidate)
        rrr = ssim(PicCanny(PilToCV2(picz)), aruy)
        bbb = ssim(PicCanny(PilToCV2(picy)), aruy)
        vvv = GetBig(rrr, bbb)
        valcon.append(vvv)
        print(vvv)

    apapa = GetMaxIndex(valcon)
    casi = PicCon[apapa]
    casi.save(dfdf)  # save the best candidate

标准jm水印
这样便能正确地还原jm的图片(但是我没法子大量实验,爬jm太多导致ip被封了),速度大概在280ms一张图左右。
不过我还是通过这个2.0版本的程序领悟到了点新东西,即用水印标定分层位置。
再采用CV2库里的匹配算法,通过获取水印位置来获得分页页数,极大减少了无用的循环,一张图还原时间大概为150~250ms,(v1.0版本的程序还原时间约为3秒)
import cv2
import numpy as np
import math
from PIL import Image
#获取正向分层距离
def GetPartHeight(Height, PartNum):
    """Return the row boundaries that cut an image into ``PartNum`` strips.

    The result starts at 0 and ends at ``Height``. Every strip is
    ``Height // PartNum`` rows tall except the last one, which also
    absorbs the remainder rows. Strip ``i`` spans ``result[i]`` to
    ``result[i + 1]``.

    Args:
        Height: Total image height in pixels (integer).
        PartNum: Number of strips.

    Returns:
        A list of ``PartNum + 1`` ascending row offsets.

    Raises:
        ZeroDivisionError: If ``PartNum`` is 0.
    """
    # Integer division stays exact for any int; float division followed
    # by floor() can lose precision on very large values.
    AvePartHeight = Height // PartNum
    return [0] + [AvePartHeight * i for i in range(1, PartNum)] + [Height]
#获取复原分层位置
def GetReCanvasPartHeight(Height, PartNum):
    """Return the paste offsets used when rebuilding the image.

    The first pasted strip is the tall one (average strip height plus the
    remainder rows), so the offsets are 0, then that tall height, then
    steps of the average strip height after it.

    Args:
        Height: Total image height in pixels.
        PartNum: Number of strips.

    Returns:
        A list starting with 0 followed by the cumulative paste offsets.
    """
    base = math.floor(Height / PartNum)
    tall = base + Height % PartNum
    offsets = [0, tall]
    offsets.extend(tall + base * step for step in range(1, PartNum))
    return offsets
#复原
def ReCanvas(PartNum, InImage):
    """Rebuild an image by reversing the order of its horizontal strips.

    ``InImage`` is cut into ``PartNum`` full-width strips at the rows
    given by ``GetPartHeight``. The strips are then pasted bottom-first
    into a new RGB canvas of the same size, at the rows given by
    ``GetReCanvasPartHeight``.

    Args:
        PartNum: Number of strips the image is cut into.
        InImage: Source PIL image.

    Returns:
        A new ``'RGB'`` PIL image with the strip order reversed.
    """
    Width, Height = InImage.width, InImage.height
    cut_rows = GetPartHeight(Height, PartNum)
    paste_rows = GetReCanvasPartHeight(Height, PartNum)
    strips = [
        InImage.crop((0, cut_rows[k], Width, cut_rows[k + 1]))
        for k in range(PartNum)
    ]
    OutImage = Image.new('RGB', (Width, Height))
    # zip stops after the PartNum strips; the trailing paste offset is unused.
    for row, strip in zip(paste_rows, reversed(strips)):
        OutImage.paste(strip, (0, row))
    return OutImage
# Driver: locate the watermark in the scrambled page with template matching
# and derive the slice count from the row where the watermark ends.
vf = 'yourpic'  # path of the scrambled page to restore
imgw = cv2.imread(vf, 0)  # read the page as grayscale
if imgw is None:
    # cv2.imread reports failure by returning None rather than raising.
    raise FileNotFoundError("cannot read image: {}".format(vf))
# Extract edge features from the page.
blurred = cv2.GaussianBlur(imgw, (5, 5), 0)
img = cv2.Canny(blurred, 50, 150)
# Same preprocessing for the reference watermark.
templateq = cv2.imread('exp.png', 0)
if templateq is None:
    raise FileNotFoundError("cannot read watermark template: exp.png")
blurred = cv2.GaussianBlur(templateq, (5, 5), 0)
template = cv2.Canny(blurred, 50, 150)
h = template.shape[0]  # template height in rows
res = cv2.matchTemplate(img, template, cv2.TM_CCOEFF)  # match
min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(res)
top_left = max_loc
# Row where the watermark ends; the script treats this as the floored
# average strip height of the scrambled page.
hei = top_left[1] + h
PartNum = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
imgese = Image.open(vf)
heit = imgese.height
# Pick the slice count whose strip height is closest to the detected row.
# An exact match has distance 0, so it always wins; iterating in reverse
# keeps the old "last match wins" order for ties. Unlike the exact-equality
# loop, this always yields a value, so `f` can no longer be undefined.
f = min(reversed(PartNum), key=lambda n: abs(heit // n - hei))
if heit // f != hei:
    print("warning: no exact slice count for watermark row {}; "
          "using closest match {}".format(hei, f))
eee = ReCanvas(f, imgese)  # restore
# The former `print(-t)` referenced an undefined name `t` and raised
# NameError before the image could be shown, so it has been removed.
eee.show()
火山引擎开发者社区是火山引擎打造的AI技术生态平台,聚焦Agent与大模型开发,提供豆包系列模型(图像/视频/视觉)、智能分析与会话工具,并配套评测集、动手实验室及行业案例库。社区通过技术沙龙、挑战赛等活动促进开发者成长,新用户可领50万Tokens权益,助力构建智能应用。
更多推荐
所有评论(0)