IT recording...

[Database] 4. 파이썬을 이용한 Data 크롤링 (BeautifulSoup 사용) 본문

Database

[Database] 4. 파이썬을 이용한 Data 크롤링 (BeautifulSoup 사용)

I-one 2021. 2. 8. 19:41

** 한글 인코딩 필요시 table, db 생성시 아래 옵션 추가하기

-- default charset=utf8 collate = utf8_bin

CREATE DATABASE bestproducts DEFAULT CHARSET=utf8 COLLATE=utf8_bin; 
CREATE TABLE tablename(field definitions) DEFAULT CHARSET=utf8 COLLATE=utf8_bin;

 

1. BeautifulSoup 사용한 크롤링

  • res = request.get('링크')
  • soup = BeautifulSoup(res.connect, 'html.parser')
  • 객체 = soup.select('태그')
import requests
from bs4 import BeautifulSoup

# 링크 연결, html.parser 붙여주기
res = requests.get('http://corners.gmarket.co.kr/Bestsellers')
soup = BeautifulSoup(res.connect, 'html.parser')

# categories에 해당 태그 값들 받아옴
categories = soup.select('div.gbest-cate ul.by-group li a')

* soup.select를 통해 접근 시 list로 받아와지고, 배열처럼 접근해야 하며

* soup.select_one을 통해 접근 시 .text()와 같은 함수 바로 사용이 가능하다

best_item = soup.select('div.best-list')

#enumerate를 통해 for문 index관리 가능

for index, item in enumerate(best_item[1].select('li')):
    data_dict = dict() # data를 dict라는 구조체로 묶어서 관리
    title = item.select_one('a.itemname').get_text()
    ori_price = item.select_one('div o-price span span')
    
    #...
    
    if ori_price == None or ori_price.get_text() =='':
    	ori_price = dis_price
    else:
    	ori_price = ori_price.get_Text().replace(',','').replace('원','')
    
    # ...
    data_dict['title'] = title
    data_dict['ori_price'] = ori_price
    # ...
    
    save_data(data_dict)

 

 

2. 저장하기

def save_data(item_info):
	#신규상품일때만 저장하게
    sql = """SELECT COUNT(*) FROM items WHERE item_code = '""" +item_info['item_code'] + """';"""
    cursor.execute(sql)
    result = cursor.fetchone()
    
    if result[0] == 0:
    	sql = """INSERT INTO items VALUES('""" + item_info['item_code'] + """',
        '""" + item_info['title'] + """',
        """ + str(item_info['ori_price']) + """,
        """ + str(item_info['dis_price']) + """,
        """ + str(item_info['discount_percent']) + """, 
        '""" + item_info['provider'] + """')"""
        cursor.execute(sql)
        
        sql = """INSERT INTO ranking (main_category, sub_category, item_ranking, item_code) VALUES('""" + item_info['category_name'] + """',
    	'""" + item_info['sub_category_name'] + """', 
    	'""" + str(item_info['ranking']) + """', 
    	'""" + item_info['item_code'] + """')"""     
    	cursor.execute(sql)

** sql구문 작성시 따옴표가 헷갈린다면

sql = """
    insert into items values(%s,%s,%s,%s,%s,%s)
    """ 
cursor.execute(sql, (item_info['item_code'],item_info['title'],str(item_info['ori_price']),
         str(item_info['dis_price']),str(item_info['discount_percent']),item_info['provider']))
                     
sql = """
	insert into items values(%s,%s,%s,%s,%s,%s)
    """
#data 따로 준비
data = ((item_info['item_code'],item_info['title'],str(item_info['ori_price']),
         str(item_info['dis_price']),str(item_info['discount_percent']),item_info['provider']))
         
cursor.execute(sql, data)
                  

참고 ) 

pythonstudy.xyz/python/article/203-MySQL-DML

 

예제로 배우는 파이썬 프로그래밍 - MySQL DML

1. MySQL DML SQL에서 데이타 조작하는 INSERT, UPDATE, DELETE 문을 DML(Data Manipulation Language)이라 한다. MySQL에서 INSERT, UPDATE, DELETE를 사용하는 일반적인 절차는 앞 아티클 (MySQL 쿼리)과 유사하다. MySQL에 DML

pythonstudy.xyz

dict 구조체

++ ) 

 

코.알.못. 마케터도 크롤링하기#4. BeautifulSoup으로 정보가져오기

설명 영상을 게시글 아래에 추가하였습니다. (updated 2021.01.15) html 구조를 살펴보았다면, 이제는 실제...

blog.naver.com

>전체코드<

더보기
import requests
from bs4 import BeautifulSoup
import pymysql

host_name = 'localhost'
host_port = 3306
username = 'root'
password = 'rjsrnreo11'
database_name = 'bestproducts'

#DB 연결
db = pymysql.connect(host=host_name, port=host_port, user=username, passwd=password, db=database_name, charset='utf8')
cursor = db.cursor()

# 메인 카테고리 받아오기
res = requests.get('http://corners.gmarket.co.kr/Bestsellers')
soup = BeautifulSoup(res.content, 'html.parser')

mainCate = soup.select('div.gbest-cate ul.by-group li a')
for main in mainCate:
    getSubCate('http://corners.gmarket.co.kr/'+main['href'], main.get_text())
    
db.commit()
db.close()

#==========================================================================================
def getSubCate(link, cateName):
    res = requests.get(link)
    soup = BeautifulSoup(res.content, 'html.parser')
    
    getItem(soup, cateName, "ALL")
    
    sub_categories = soup.select('div.navi ul li a')
    for sub in sub_categories:
        res = requests.get(sub['href'])
        soup = BeautilfulSoup(res.content, 'html.parser')
        getItem(soup, cateName, sub.get_text())
#     for sub in sub_categories:
#         print(cateName, sub.get_text(), 'http://corners.gmarket.co.kr/' + sub['href'])

#==========================================================================================
def getItem(soup, category_name, sub_category_name):
    item_list = list()
    best_item = soup.select('div.best-list')
    
    for index, item in enumerate(best_item[1].select('li')):
        data_dict = dict()
        ranking = index + 1
        title = item.select_one('a.itemname').get_text()
        ori_price = item.select_one('div.o-price span span')
        dis_price = item.select_one('div s-price strong span span')
        discount_percent = item.select_one('div s-price em')
        
        if ori_price == None or ori_price.get_text() == '':
            ori_price = dis_price
        if dis_price ==None or dis_price.get_text() =='':
            ori_price, dis_price = 0, 0
        else:
            ori_price = ori_price.get_text().replace(',','').replace('원','')
            dis_price = dis_price.get_text().replace(',','').replace('원','')
        if discount_percent == None or discount_percent == '':
            discount_percent = 0
        else:
            discount_percent = discount_percent.get_text().replace('%','')
            
        product_link = item.select_one('a.itemname')
        item_code = product_link.attrs['href'].split('=')[1]
        
        res = requests.get(product_link.attrs['href'])
        soup = BeautifulSoup(res.content, 'html.parser')
        
        provider = soup.select_one('span.text__seller a')
        if provider == None or provider =='':
            provider = ''
        else:
            provider = provider.get_text()
            
        data_dict['category_name'] = category_name
        data_dict['sub_category_name'] = sub_category_name
        data_dict['ranking'] = ranking
        data_dict['title'] = title
        data_dict['ori_price'] = ori_price
        data_dict['dis_price'] = dis_price
        data_dict['discount_percent'] = discount_percent
        data_dict['item_code'] = item_code
        data_dict['provider'] = provider
        
        print(data_dict)
        save_data(data_dict)
#==========================================================================================
def save_data(item_info):
    
    #신규 상품일 때만 저장하게
    sql = """SELECT COUNT(*) FROM items WHERE item_code = '""" + item_info['item_code'] + """';"""
    cursor.execute(sql)
    result = cursor.fetchone()
    
    if result[0] == 0:   
#         sql = """INSERT INTO items VALUES('""" + item_info['item_code'] + """',
#         '""" + item_info['title'] + """', 
#         """ + str(item_info['ori_price']) + """, 
#         """ + str(item_info['dis_price']) + """, 
#         """ + str(item_info['discount_percent']) + """, 
#         '""" + item_info['provider'] + """')"""
#         cursor.execute(sql)
    
        sql = """
        insert into items values(%s,%s,%s,%s,%s,%s)
        """ 
        cursor.execute(sql, (item_info['item_code'],item_info['title'],str(item_info['ori_price']),
                            str(item_info['dis_price']),str(item_info['discount_percent']),item_info['provider']))
                                                                                        
    sql = """INSERT INTO ranking (main_category, sub_category, item_ranking, item_code) VALUES('""" + item_info['category_name'] + """',
    '""" + item_info['sub_category_name'] + """', 
    '""" + str(item_info['ranking']) + """', 
    '""" + item_info['item_code'] + """')"""     
    cursor.execute(sql)
Comments