리눅스

공공데이터포털 데이터 목록 및 내용 크롤링

LIMMI 2024. 5. 3. 10:01

디비 연결 후 삽입까지

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
import time
import sys
import pandas as pd

# Crawl the data.go.kr file-dataset list page, visit every dataset detail page,
# and store each (header, value) pair of its metadata table in the Oracle
# table `lim` as one row (title, href, keys, value).

# Connect to the Oracle database
import oracledb
con = oracledb.connect(user="TEST",password="test",dsn="localhost:1521/orcl")
cursor = con.cursor()
print("!! connection complete !!")
url = 'https://www.data.go.kr/tcs/dss/selectDataSetList.do?dType=FILE&keyword=&operator=AND&detailKeyword=&publicDataPk=&recmSe=&detailText=&relatedKeyword=&commaNotInData=&commaAndData=&commaOrData=&must_not=&tabId=&dataSetCoreTf=&coreDataNm=&sort=&relRadio=&orgFullName=%EB%86%8D%EB%A6%BC%EC%B6%95%EC%82%B0%EC%8B%9D%ED%92%88%EB%B6%80&orgFilter=%EB%86%8D%EB%A6%BC%EC%B6%95%EC%82%B0%EC%8B%9D%ED%92%88%EB%B6%80&org=%EB%86%8D%EB%A6%BC%EC%B6%95%EC%82%B0%EC%8B%9D%ED%92%88%EB%B6%80&orgSearch=&currentPage=1&perPage=400&brm=&instt=&svcType=&kwrdArray=&extsn=&coreDataNmArray=&pblonsipScopeCode=#'

# Start the browser; the "detach" option keeps the Chrome window open after
# the script finishes.
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

# implicitly_wait() is not a sleep: it sets the session-wide timeout used by
# every later find_element(s) call, so it only needs to be set once.
driver.implicitly_wait(6)

# Open the list page
driver.get(url)

# Collect the anchors that point at the dataset detail pages.
links = driver.find_elements(By.CSS_SELECTOR,"#fileDataList > div.result-list > ul > li > dl > dt > a")

# Read every href as a plain string BEFORE navigating anywhere. The WebElement
# handles in `links` belong to the list page and go stale as soon as the
# driver loads a detail page, so they must not be dereferenced inside the loop.
detail_urls = [link.get_attribute("href") for link in links]

# Accumulators mirroring what is written to the database, one entry per row.
keys=[]
values=[]
titles=[]
hrefs=[]

# NOTE(review): the first 39 links are skipped, presumably to resume after an
# earlier partial run -- confirm, and set to 0 for a full crawl.
start_index = 39

sql_insert = 'insert into lim (title,href,keys,value) values(:title,:href,:keys,:value)'

for href in detail_urls[start_index:]:
    # Navigate straight to the detail page; no driver.back() is needed because
    # the next iteration loads its own URL.
    driver.get(href)
    print(href)
    table = driver.find_element(By.CLASS_NAME,"file-meta-table-pc")

    # Locate the title by XPath and the table cells by tag name.
    title = driver.find_element(By.XPATH,'//*[@id="contents"]/div[2]/div[1]/div[1]/p')
    row = table.find_elements(By.TAG_NAME,"th")
    data = table.find_elements(By.TAG_NAME,"td")

    # Each .text access is a round trip to the browser, so read it once.
    title_text = title.text

    print('Rows --> {}'.format(len(row)))
    print('Data --> {}'.format(len(data)))
    print('title -->'+ title_text)

    # Pair the n-th header cell with the n-th data cell; zip() stops at the
    # shorter of the two sequences.
    page_rows = []
    for key,value in zip(row,data):
        key_text = key.text
        value_text = value.text
        keys.append(key_text)
        values.append(value_text)
        titles.append(title_text)
        hrefs.append(href)
        page_rows.append({"title": title_text, "href": href, "keys": key_text, "value": value_text})

    # Insert the whole page in one batch and commit once per page.
    if page_rows:
        cursor.executemany(sql_insert, page_rows)
        con.commit()
   





from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
import time
import sys
import pandas as pd


# Open a single page, read its metadata table, and store each (header, value)
# pair in the Oracle table `lim` as one row (title, href, keys, value).

# Connect to the Oracle database
import oracledb
con = oracledb.connect(user="TEST",password="test",dsn="localhost:1521/orcl")
cursor = con.cursor()
print("!! connection complete !!")

# Start the browser; the "detach" option keeps the Chrome window open after
# the script finishes.
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

# implicitly_wait() is not a sleep: it sets the session-wide timeout used by
# every later find_element(s) call, so it is set once, before the lookups.
driver.implicitly_wait(7)

# Open the page
# NOTE(review): `url` is not assigned anywhere in this snippet; it relies on a
# value defined earlier in the file. The lookups below target a
# "file-meta-table-pc" table, so this presumably needs a dataset detail-page
# URL -- confirm.
driver.get(url)
table = driver.find_element(By.CLASS_NAME,"file-meta-table-pc")

# Locate the title by XPath and the table cells by tag name.
title = driver.find_element(By.XPATH,'//*[@id="contents"]/div[2]/div[1]/div[1]/p')
row = table.find_elements(By.TAG_NAME,"th")
data = table.find_elements(By.TAG_NAME,"td")
print('Rows --> {}'.format(len(row)))
print('Data --> {}'.format(len(data)))

# Each .text access is a round trip to the browser, so read the title once.
title_text = title.text

sql_insert = 'insert into lim (title,href,keys,value) values(:title,:href,:keys,:value)'

# Pair the n-th header cell with the n-th data cell; zip() stops at the
# shorter of the two sequences.
page_rows = [
    {"title": title_text, "href": url, "keys": key.text, "value": value.text}
    for key, value in zip(row, data)
]

# Insert all rows in one batch and commit once.
if page_rows:
    cursor.executemany(sql_insert, page_rows)
    con.commit()