关于多种验证码的应对方式-百木园

爬虫程序会遇到各式各样的验证码，这里整理一下常用的解决方法。

1.使用均值哈希算法进行图像识别

原理是先求出图像的像素平均值，再把每个像素与平均值比较，得到一串形如01010001的字符串（哈希值）。

然后比较两张图片的哈希值在相同位置上的字符是否相同，统计出相同位的数量作为相似度的依据。

适用于文字或字母不规则、难以直接识别时的图像对比，适用面不广，但思路可以借鉴。

代码如下：

#开发时间： 2022/11/2 20:42
import cv2
from PIL import Image
import numpy as np

# Edit these two paths for your machine.
path = r"E:\pic\123\8.jpg"  # main image to be recognised
path8 = r"E:\pic\123"  # folder holding the eight reference images 0.jpg-7.jpg; no trailing backslash, e.g. E:\pic

#均值哈希算法
def aHash(img):
    """Compute the average hash (aHash) of an image file.

    The image is shrunk to 10x10, converted to grayscale, and every pixel is
    compared with the mean gray level of that thumbnail.

    Args:
        img: Path of the image file, passed to ``cv2.imread``.

    Returns:
        A 100-character string, one character per pixel in row-major order:
        ``'1'`` where the pixel is brighter than the mean, ``'0'`` otherwise.

    Raises:
        OSError: If ``cv2.imread`` could not read the image.
    """
    image = cv2.imread(img)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise OSError(f"cannot read image: {img}")
    thumb = cv2.resize(image, (10, 10))
    gray = cv2.cvtColor(thumb, cv2.COLOR_BGR2GRAY)
    # Let NumPy compute the mean in floating point: adding the uint8 pixels
    # one at a time can wrap around at 255 under NumPy 2 promotion rules.
    avg = gray.mean()
    return "".join("1" if pixel > avg else "0" for pixel in gray.flat)

def cmpHash(hash1, hash2):
    """Count the positions at which two hash strings differ.

    Args:
        hash1: First hash string.
        hash2: Second hash string.

    Returns:
        The number of positions where the two hashes hold different
        characters (0 means the hashes are identical), or -1 when the two
        hashes have different lengths, which signals a bad argument.
    """
    if len(hash1) != len(hash2):
        return -1
    return sum(1 for left, right in zip(hash1, hash2) if left != right)

def chanese2num(image1):
    """Return the index of the reference image most similar to ``image1``.

    Each of the eight reference images ``<path8>/0.jpg`` .. ``<path8>/7.jpg``
    is binarised with ``blackwhite(..., 0)``, hashed with ``aHash`` and
    compared with the hash of ``image1``.

    Args:
        image1: Path of the image to classify.

    Returns:
        The index (0-7) of the reference image with the highest similarity
        score; on a tie the later index wins. ``None`` if no reference image
        produced a comparable hash.
    """
    hash1 = aHash(image1)
    best_score = 0
    num = None
    for i in range(8):
        image2 = f"{path8}/{i}.jpg"
        # This rewrites the reference image on disk.
        # NOTE(review): the flag-0 threshold maps dark pixels to 1 and light
        # pixels to 0, so re-running on an already processed file may flip
        # it again - confirm this is intended.
        blackwhite(image2, 0)
        distance = cmpHash(hash1, aHash(image2))
        if distance < 0:
            # cmpHash returns -1 for mismatched lengths; without this guard
            # 100 - (-1) = 101 would beat every genuine score.
            continue
        # The hashes are 100 characters long, so this is the number of
        # matching positions.
        score = 100 - distance
        if score >= best_score:
            best_score = score
            num = i
    return num

def blackwhite(input_img_file, flag):
    """Convert an image file to a two-level image and save it.

    The image is first converted to 8-bit grayscale (mode ``L``: 0 is black,
    255 is white), then mapped through a 256-entry lookup table into PIL
    mode ``1``.

    Args:
        input_img_file: Path of the image to read.
        flag: Selects the threshold preset.
            0 - for white text on a blue background: gray levels below 180
                map to 1, the rest to 0; the result overwrites
                ``input_img_file``.
            1 - for red text on a white background: gray levels below 228
                map to 0, the rest to 1; the result is written to the fixed
                path ``E:\\pic\\123/test.jpg``.
            Any other value reads the image but saves nothing.
    """
    # Close the file handle as soon as the grayscale copy exists, so the
    # flag-0 branch can safely overwrite the same file.
    with Image.open(input_img_file) as img:
        gray = img.convert("L")
    if flag == 0:
        threshold = 180
        table = [1 if level < threshold else 0 for level in range(256)]
        gray.point(table, "1").save(input_img_file)
    if flag == 1:
        threshold = 228
        table = [0 if level < threshold else 1 for level in range(256)]
        # NOTE(review): the output path is hard-coded rather than derived
        # from input_img_file - confirm callers always pass this same file.
        gray.point(table, "1").save(r"E:\pic\123/test.jpg")

def main():
    """Crop a fixed region from ``path``, binarise it and classify it.

    The crop is saved to ``E:\\pic\\123\\test.jpg``, binarised in place with
    ``blackwhite(..., 1)`` and compared against the reference images by
    ``chanese2num``.

    Returns:
        The index of the most similar reference image, as returned by
        ``chanese2num``.

    Raises:
        OSError: If ``cv2.imread`` could not read ``path``.
    """
    image = cv2.imread(path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise OSError(f"cannot read image: {path}")
    # Slicing works because OpenCV images are NumPy arrays:
    # rows 2-31, columns 140-210.
    frame1 = image[2:32, 140:211]
    save_file = r"E:\pic\123\test.jpg"
    # Encode in memory and write with tofile; the original author notes this
    # avoids garbled file names.
    cv2.imencode(".jpg", frame1)[1].tofile(save_file)
    # Binarise the crop; the flag-1 branch writes back to this same file.
    blackwhite(save_file, 1)
    return chanese2num(save_file)

if __name__ == '__main__':
    num = main()
    print(f"编号是{num}的图片是相似度最高的")
    # Calling this from a crawler program, where my_crawler is the folder
    # that holds this file:
    #
    #     from my_crawler import yanzhengma2
    #     if __name__ == '__main__':
    #         num = yanzhengma2.main()
    #         print(num)
    #
    # num is the index you need.

2.使用pytesseract进行识别

我用它识别过文字，但效果很差，它对文字间隔和像素质量的要求都很严格。

整体来说效果不好

import pytesseract
import numpy as np
from PIL import Image

def threshold_By_OTSU(input_img_file):
    """Run Tesseract OCR (Simplified Chinese) on a grayscale copy of an image.

    Despite the name, no Otsu thresholding is applied here: the image is
    only converted to grayscale (mode ``L``). The grayscale copy is saved to
    the fixed path ``E:\\pic\\123/9.jpg`` and the recognised text is printed.

    Args:
        input_img_file: Path of the image to recognise.

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    with Image.open(input_img_file) as source:
        newimage = source.convert("L")
    newimage.save(r"E:\pic\123/9.jpg")
    text = pytesseract.image_to_string(newimage, lang="chi_sim")
    print(text)
    return text

# Image to recognise; adjust for your machine.
file = r'E:\pic\123\8.jpg'
threshold_By_OTSU(file)

这里使用时，遇到了opencv和PIL读取图片方式不同的问题：cv2.imread返回的是BGR通道顺序的numpy数组，而PIL的Image.open返回的是Image对象（通道顺序为RGB），两者不能直接混用。之后再仔细研究一下使用规范。

3.使用ddddocr进行识别

带带弟弟是一个非常强大的验证码识别工具，可以识别汉字，滑块，点击文字等许多种类的验证码

github源码：https://github.com/sml2h3/ddddocr

pypi官方：https://pypi.org/project/ddddocr/

对于文字识别，代码如下：

import os
import ddddocr
from PIL import Image
from io import BytesIO

# File extensions (lower case, without the dot) that ocr_detect accepts.
ALLOW_FILE_TYPE = ['jpg', 'png']
# One shared recogniser instance, created with show_ad=False.
ocr = ddddocr.DdddOcr(show_ad=False)

def ocr_detect(path):
    """Recognise the text in an image file with the shared ddddocr instance.

    Args:
        path: Path of the image file. Its extension (the text after the last
            dot, case-insensitive) must be listed in ``ALLOW_FILE_TYPE``.

    Returns:
        The value returned by ``ocr.classification``, or ``None`` when the
        path does not exist, has no extension or a disallowed one, or when
        recognition fails (the error is printed, not raised).
    """
    code = None

    if not os.path.exists(path):
        return code

    # rpartition never raises; unpacking rsplit(".", 1) did for a path
    # without any dot.
    _, dot, file_type = path.rpartition(".")
    if not dot or file_type.lower() not in ALLOW_FILE_TYPE:
        return code

    try:
        with open(path, 'rb') as fp:
            img = Image.open(BytesIO(fp.read()))
        code = ocr.classification(img)
    except Exception as exc:
        # Best effort: report the failure and fall through to return None.
        print('[ERROR] 识别发生错误:', exc)
    return code

if __name__ == '__main__':
    # Recognise the target image, then compare against each reference image.
    code = ocr_detect(r'E:\pic\123\test.jpg')
    for i in range(8):
        # Raw f-string: a plain f-string would treat "\p" as an invalid
        # escape sequence.
        code2 = ocr_detect(rf'E:\pic/123/{i}.jpg')
        # ocr_detect returns None on failure; two failures must not count
        # as a match.
        if code is not None and code == code2:
            print(f"编码为{i}的是对的")
        else:
            print("寄了")

具体使用方式参考官方文档即可

更多方法用到后会继续更新~

来源：https://www.cnblogs.com/ningyuan233/p/16856787.html
本站部分图文来源于网络，如有侵权请联系删除。

关于多种验证码的应对方式

1.使用均值哈希算法进行图像识别

2.使用pytesseract进行识别

3.使用ddddocr进行识别

相关推荐

热门文章