博客园固定分类文章抓取

import pymysql
import requests
import re
import random
import time
import json
import html
import codecs

# --- Configuration -----------------------------------------------------------
# NOTE(review): credentials are hard-coded exactly as in the original script;
# consider loading them from the environment before sharing this file.
DB_CONFIG = {
    'host': '127.0.0.1',
    'user': 'root',
    'password': '123456',
    'database': 'lyj_xtblog',
    'charset': 'utf8',
}
table = 'lz_diary'

BASE_URL = 'https://www.cnblogs.com/cate/php/'
FIRST_PAGE = 1
LAST_PAGE = 78        # the original loop ran ``while i < 79``
REQUEST_TIMEOUT = 10  # seconds; requests has no default timeout

# --- Regular expressions -----------------------------------------------------
# Raw strings so that "\s" is a regex class and not an invalid string escape.
pattern = r'<article class="post-item">(.*?\s)</article>'
re_url = r'<a class="post-item-title" href="(.*?)" target="_blank">.*?</a>'
re_title = r'<a class="post-item-title" href=".*?" target="_blank">(.*?)</a>'
re_desc = r'<p class="post-item-summary">(.*?)</p>'
re_addtime = r'<span class="post-meta-item">(.*?)</span>'
# NOTE(review): the original script used ``re_details`` without ever defining
# it.  This pattern assumes the article body is wrapped in
# <div id="cnblogs_post_body" ...> and is directly followed by either the
# "MySignature" div or a "clear" div -- confirm against the live page markup.
re_details = (r'<div id="cnblogs_post_body"[^>]*>(.*?)</div>\s*'
              r'<div (?:id="MySignature"|class="clear")')

re_tag = re.compile(r'<img .*? />')   # <img> tags removed from the summary
re_any_tag = re.compile(r'<[^>]+>')   # any tag, removed from the publish time


def page_url(page):
    """Return the listing URL for ``page``; page 0 is the bare category URL."""
    if page == 0:
        return BASE_URL
    return BASE_URL + str(page)


def fetch(url):
    """Return the response text of a GET on ``url``, or ``None`` on failure.

    Network errors and non-2xx statuses are reported on stdout and swallowed
    so that one bad page does not abort the whole crawl.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as err:
        print('skip {}: {}'.format(url, err))
        return None
    return response.text


def parse_article(fragment):
    """Extract the list-page fields from one ``<article>`` fragment.

    Returns a dict with the keys ``url``, ``title``, ``describes`` and
    ``addtime``, or ``None`` when the title link cannot be found.
    ``describes`` and ``addtime`` are empty strings when their markup is
    missing.
    """
    urls = re.findall(re_url, fragment)
    titles = re.findall(re_title, fragment)
    if not urls or not titles:
        return None

    summaries = re.findall(re_desc, fragment, re.S | re.M)
    describes = re_tag.sub('', summaries[0]).strip() if summaries else ''

    # DOTALL lets the match span line breaks; any tag nested inside the span
    # is stripped so that only the text remains.
    times = re.findall(re_addtime, fragment, re.S)
    addtime = re_any_tag.sub('', times[0]).strip() if times else ''

    return {
        'url': urls[0],
        'title': titles[0],
        'describes': describes,
        'addtime': addtime,
    }


def extract_content(page_html):
    """Return the HTML-escaped article body of a detail page, or ``None``."""
    bodies = re.findall(re_details, page_html, re.S | re.M)
    if not bodies:
        return None
    return html.escape(bodies[0])


def build_insert(table_name, record):
    """Build a parameterized INSERT for ``record``.

    Returns ``(sql, params)``: column names come from the dict keys and the
    values are passed separately as ``%s`` parameters.
    """
    keys = ','.join(record)
    values = ','.join(['%s'] * len(record))
    sql = "INSERT INTO {table}({keys}) VALUES ({values})".format(
        table=table_name, keys=keys, values=values)
    return sql, tuple(record.values())


def main():
    """Crawl every listing page and store each article in the database."""
    conn = pymysql.connect(**DB_CONFIG)
    cursor = conn.cursor()
    try:
        for page in range(FIRST_PAGE, LAST_PAGE + 1):
            listing_html = fetch(page_url(page))
            if listing_html is None:
                continue

            for fragment in re.findall(pattern, listing_html, re.S | re.M):
                article = parse_article(fragment)
                if article is None:
                    continue

                # Fetch the detail page for the full article body.
                detail_html = fetch(article['url'])
                content = None
                if detail_html is not None:
                    content = extract_content(detail_html)
                if content is None:
                    print('skip {}: article body not found'.format(article['url']))
                    continue

                # A fresh dict per article, so no value can leak over from the
                # previous row.  Falls back to the current local time when the
                # list page gave no publish time.
                record = {
                    'addtime': article['addtime'] or time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.localtime()),
                    'title': article['title'],
                    'describes': article['describes'],
                    'content': content,
                    'click': random.randint(100, 999),
                    'source_type': 2,
                }
                sql, params = build_insert(table, record)
                cursor.execute(sql, params)
                conn.commit()
    finally:
        cursor.close()
        conn.close()


if __name__ == "__main__":
    main()

标签

发表评论