博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
使用 Python 抓取博客园首页的所有数据,并定时持续抓取新发布的内容存入 MongoDB 中...
阅读量:7113 次
发布时间:2019-06-28

本文共 4822 字,大约阅读时间需要 16 分钟。

原文地址:

# -*- coding: utf-8 -*-
"""
Crawl the cnblogs.com home-page listing and store new posts in MongoDB.

Dependencies:
    1. jieba
    2. pymongo
    3. HTMLParser

@author: jiangfuqiang
"""
from HTMLParser import HTMLParser
import re
import time
from datetime import date
import pymongo
import urllib2
import sys
import traceback
import jieba

default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    # Python 2 only: switch the implicit str<->unicode codec to UTF-8.
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Set to True once a post that is not newer than the stored maximum id is
# seen (or a page yields no new posts); the crawl loop stops on it.
isExist = False

# Give up after this many consecutive failed attempts instead of retrying
# the same page forever.
MAX_RETRIES = 3
# Seconds to wait between pages and before retrying a failed page.
PAGE_DELAY = 5


class FetchCnblog(HTMLParser):
    """Parse one listing page and collect the posts newer than a given id.

    The parser reacts to these class attributes in the markup:
    ``post_item`` (div), ``titlelnk`` (a), ``post_item_summary`` (p),
    ``pfs`` (img), ``post_item_foot`` (div), ``article_view`` (span) and
    ``gray`` (a).  A post is finalised when the ``gray`` link inside an
    ``article_view`` span is reached.
    """

    def __init__(self, id):
        """Create a parser.

        id -- posts whose numeric id is <= this value are treated as
              already stored and are skipped.
        """
        HTMLParser.__init__(self)
        self.result = []
        self.data = {}
        self.isTitleLink = False
        self.id = id
        self.isSummary = False
        self.isPostItem = False
        self.isArticleView = False

    def handle_data(self, data):
        """Record the title or summary text of the post being parsed."""
        if self.isTitleLink and self.isPostItem:
            self.data['title'] = data
            self.isTitleLink = False
        elif self.isSummary and self.isPostItem:
            data = data.strip()
            if data:
                # Later non-empty chunks overwrite earlier ones.
                self.data['desc'] = data

    def handle_starttag(self, tag, attrs):
        """Update the parser state from an opening tag."""
        attr = dict(attrs)
        cssClass = attr.get('class')
        if tag == 'a':
            if cssClass == 'titlelnk':
                self.isTitleLink = True
            elif cssClass == 'gray' and self.isArticleView:
                self.isArticleView = False
                href = attr.get('href')
                if href is not None:
                    self._finishPost(href)
        elif tag == 'p':
            if cssClass == 'post_item_summary':
                self.isSummary = True
        elif tag == 'img':
            if cssClass == 'pfs' and 'src' in attr:
                self.data['imgSrc'] = attr['src']
        elif tag == 'div':
            if cssClass == 'post_item_foot':
                self.isSummary = False
            elif cssClass == 'post_item':
                self.isPostItem = True
        elif tag == 'span':
            if cssClass == 'article_view':
                self.isArticleView = True

    def _finishPost(self, href):
        """Finalise the current post using the link found in its footer."""
        # Without this declaration the assignment below would only create a
        # local variable and the crawl loop would never see the flag.
        global isExist
        self.data['readmoreLink'] = href
        self.isPostItem = False
        # The post id is taken to be the first run of digits in the link.
        match = re.search(r'\d+', href)
        if not match:
            self.data = {}
            return
        postId = int(match.group())
        if postId <= self.id:
            # Reached a post that was stored by an earlier run.
            self.data = {}
            isExist = True
            return
        self.data['id'] = postId
        # NOTE(review): 'srouce' looks like a typo for 'source'; the key is
        # kept as-is because it is the field name written to the database.
        self.data['srouce'] = "www.cnblogs.com"
        self.data['source_key'] = 'cnblogs'
        self.data['fetchTime'] = str(date.today())
        # A post without a captured title gets an empty keyword string
        # instead of raising KeyError.
        self.data['keyword'] = ",".join(jieba.cut(self.data.get('title', '')))
        self.result.append(self.data)
        self.data = {}

    def getResult(self):
        """Return the list of post dicts collected so far."""
        return self.result


def main():
    """Fetch listing pages until no new posts remain, saving each new post."""
    global isExist
    # pymongo.Connection was removed in PyMongo 3; prefer MongoClient and
    # fall back to Connection on very old PyMongo releases.
    clientFactory = getattr(pymongo, 'MongoClient', None) or pymongo.Connection
    con = clientFactory('localhost', 27017)
    db = con.blog
    fetchblog = db.fetch_blog
    record = db.record
    url = "http://www.cnblogs.com/sitehome/p/%d"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; '
                      'rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

    reco = record.find_one({"type": 'cnblogs'})
    # Threshold from the previous run.  It must stay fixed for the whole
    # crawl: replacing it with the newest id after page 1 would make every
    # post on page 2 and later look "already stored".
    lastMaxId = reco['maxId'] if reco else 0

    count = 1
    maxIdSaved = False
    failures = 0
    while not isExist:
        try:
            req = urllib2.Request(url % count, headers=headers)
            response = urllib2.urlopen(req)
            try:
                data = response.read()
            finally:
                response.close()
            fj = FetchCnblog(lastMaxId)
            fj.feed(data)
            result = fj.getResult()
            if len(result) < 1:
                isExist = True
            else:
                if not maxIdSaved:
                    # The first collected post on the first page carries the
                    # id that is stored as the new maximum.
                    newMaxId = int(result[0]['id'])
                    record.update({"type": 'cnblogs'},
                                  {"$set": {'maxId': newMaxId}}, True, False)
                    maxIdSaved = True
                # Insert in reverse of the page order.
                result.reverse()
                for doc in result:
                    fetchblog.insert(doc)
                print("page is %d" % count)
                count += 1
                failures = 0
                time.sleep(PAGE_DELAY)
        except Exception as e:
            traceback.print_exc()
            print("parse error %s" % e)
            failures += 1
            if failures >= MAX_RETRIES:
                break
            # Back off before retrying the same page.
            time.sleep(PAGE_DELAY)


if __name__ == "__main__":
    main()

# Deployment note: the program is assumed to run on Linux or macOS, where it
# can be scheduled with `crontab -e`.  On Windows, add your own timer to the
# program instead.

你可能感兴趣的文章
分享几个Python小技巧函数里的4个小花招
查看>>
input file上传base编码图片及上传同一张图片
查看>>
爬虫网页解析之css用法及实战爬取中国校花网
查看>>
手拉手教你实现一门编程语言 Enkel, 系列 3
查看>>
JavaScript 复习之 String 对象
查看>>
面试技巧
查看>>
JS 中 'hello' 和 new String('hello') 引出的问题
查看>>
行高与字体的关系
查看>>
Android FragmentManager使用
查看>>
记一次移动端使用 rem 的兼容性问题
查看>>
区块链--共识算法POW
查看>>
JS中常用的8种跨域方式讲解
查看>>
Kotlin DSL 实战
查看>>
权力的游戏 第七季高清 BT 下载
查看>>
区块链开发 HSM技术
查看>>
GitHub排名TOP30的机器学习开源项目
查看>>
(译)使用Spring Boot和Axon实现CQRS&Event Sourcing
查看>>
node+express forever命令总结
查看>>
理解设计模式
查看>>
模型剖析 | 如何解决业务运维的四大难题?
查看>>