本文使用 BeautifulSoup 库抓取网页(本人还是新手)。对于中文页面,有的需要添加 from_encoding = "GBK";此外还要处理数据库的各种转码,比如 con.escape_string()。这个问题卡了我一下午加一晚上,最后还是在 Stack Overflow 上碰到了解决方案。下面以腾讯新闻为例:

    # coding: utf-8
    # 文章信息保存到details_list中,包括标题,作者,发布时间,摘要,内容, 原地址
    # /usr/bin/python
    # author: fish
     
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')
    from bs4 import BeautifulSoup
    import urllib2
    import urllib
    import socket
    import re
    import MySQLdb
    # Index page that is scanned for links to individual articles.
    url = 'http://tech.qq.com'
    # Default timeout (in seconds) for sockets created from here on.
    socket.setdefaulttimeout(200)
    # The encoding is passed explicitly (GBK) instead of being auto-detected.
    soup = BeautifulSoup(urllib.urlopen(url), from_encoding='GBK')

    # Every <div class="Q-tpListInner"> is expected to wrap an <a> pointing
    # at an article page.
    title = soup.find_all('div', 'Q-tpListInner')
    href_list = []
    for detail_href in title:
        anchor = detail_href.a
        # Skip list entries without a link (or with an <a> lacking href)
        # rather than swallowing every exception or collecting None values.
        link_target = anchor.get('href') if anchor is not None else None
        if link_target:
            href_list.append(link_target)

    # Collects one [title, author, pub_date, source_address, abridgement,
    # contents] record per stored article.
    details_list = []
    # Connect to MySQL, then scrape every article page in href_list and store
    # it in the jr_category / jr_content tables.
    # NOTE(review): the database credentials are hard-coded below; consider
    # loading them from configuration or the environment instead.
    con = None
    cur = None
    try:
        con = MySQLdb.connect(host='localhost', user='root', passwd='baidusql', charset='utf8')
        con.select_db('ali_app')
        cur = con.cursor()

        for href in href_list:
            detail_soup = BeautifulSoup(urllib.urlopen(href).read(), from_encoding='GBK')
            print(href)

            try:
                article_title = detail_soup.find('div', 'hd').h1.string
                article_pub_date = detail_soup.find('span', 'pubTime').string
                article_author = detail_soup.find('span', 'auth').string
                article_abridgement = detail_soup.find(attrs={'name': 'Description'}).get('content')
            except AttributeError:
                # One of the find() calls returned None, i.e. the page does
                # not have the expected layout. Skip it instead of carrying
                # on with undefined values or values from a previous article.
                continue

            article_contents = detail_soup.find('div', id='Cnt-Main-Article-QQ')
            if article_contents is None:
                # No article body on this page: nothing worth storing.
                continue
            # Pages containing these markers are skipped (same filter as before).
            if 'videoPlayer' in str(article_contents) or 'gqMaskshowBT' in str(detail_soup):
                continue

            # Normalise the scraped fields to plain strings. str() relies on
            # the utf-8 default encoding configured at the top of the script.
            # Missing text is stored as an empty string rather than the
            # literal "None"; a missing author falls back to the site name.
            article_title = str(article_title or '')
            article_pub_date = str(article_pub_date or '')
            article_author = str(article_author or '腾讯科技')
            article_abridgement = str(article_abridgement or '')
            article_source_address = href

            sub_details_list = [
                article_title,
                article_author,
                article_pub_date,
                article_source_address,
                article_abridgement,
                article_contents,
            ]

            # Replace any rows previously stored for this URL. The values are
            # passed as query parameters so the driver does the quoting and
            # escaping; scraped text must never be formatted into the SQL.
            cur.execute('delete from jr_category where source_address = %s',
                        (article_source_address,))
            cur.execute('delete from jr_content where source_address = %s',
                        (article_source_address,))
            cur.execute(
                'insert into jr_category(sid, title, date, author, source_address) '
                'values(3, %s, %s, %s, %s)',
                (article_title, article_pub_date, article_author, article_source_address))
            cur.execute(
                'insert into jr_content(source_address, description, content) '
                'values(%s, %s, %s)',
                (article_source_address, article_abridgement, str(article_contents)))

            # Record every stored article, not only the last one.
            details_list.append(sub_details_list)

        con.commit()
    except MySQLdb.Error as e:
        # Not every MySQLdb error carries both an error code and a message.
        if len(e.args) >= 2:
            print("Error %d: %s" % (e.args[0], e.args[1]))
        else:
            print("Error: %s" % (e,))
    finally:
        # Release the cursor and the connection on failure as well.
        if cur is not None:
            cur.close()
        if con is not None:
            con.close()


blog comments powered by Disqus

Published

08 February 2014

Tags