Code that scrapes each URL in an Excel sheet, extracts email addresses from the page, and appends the scraped emails to the end of the corresponding row


영리치 · 2023.02.28 16:29
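The script below needs pandas, requests, and an Excel engine; a minimal setup, assuming a standard pip environment (beautifulsoup4 is only needed for the mailto: variant sketched at the end):

pip install pandas requests openpyxl beautifulsoup4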


import pandas as pd
import requests
import re
from urllib.parse import urlparse

# read every sheet of the workbook into a dict of DataFrames
dfs = pd.read_excel('voucher.xlsx', sheet_name=None)
df_new = pd.DataFrame()

# loop through each row of Sheet1
for index, row in dfs['Sheet1'].iterrows():
    url = row.iloc[5]  # website URL sits in column index 5 (the 6th column)
    try:
        urlparse(url)  # rough sanity check; raises ValueError on malformed URLs

        # send request to the website (assumes the sheet stores bare domains without a scheme)
        response = requests.get(f"http://{url}", timeout=5)
        html_content = response.text
        if response.status_code != 200:
            print(f"Failed to retrieve {url}: {response.status_code}")

        # regex pattern to match email addresses
        email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        emails = re.findall(email_regex, html_content)
        print(f"#### {emails}")

        # append the scraped addresses to the row and also log them to a text file
        if len(emails) > 0:
            row['Email'] = ';'.join(emails)
            print(f"$$$$$${row}")
            with open('Email_sheet1.txt', 'a') as f:
                for email in emails:
                    f.write(email + '\n')
    except requests.exceptions.Timeout:
        print(f"TIME OUT: {url}")
    except requests.exceptions.RequestException as e:
        print(f"Error retrieving {url}: {e}")
    except ValueError:
        print(f"Invalid URL: {url}")
    # DataFrame.append was removed in pandas 2.0; concat the row instead
    df_new = pd.concat([df_new, row.to_frame().T], ignore_index=True)

df_new.to_excel("Email_Sheet1.xlsx", index=False)
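
An alternative to regex matching over the raw HTML is to collect addresses only from mailto: links, which tends to produce fewer false positives. A minimal sketch with BeautifulSoup (the helper name extract_mailto_emails is illustrative, not part of the script above):

from bs4 import BeautifulSoup

def extract_mailto_emails(html):
    # parse the page and pull addresses out of mailto: anchors
    soup = BeautifulSoup(html, 'html.parser')
    emails = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.lower().startswith('mailto:'):
            # drop any ?subject=... query part after the address
            emails.append(href[len('mailto:'):].split('?')[0])
    return emails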


This creates a new Excel file (Email_Sheet1.xlsx) with the scraped emails appended to each row.
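
To spot-check the result, the new workbook can be read back with pandas (rows where no address was found will simply have no value in the trailing Email column):

import pandas as pd

df = pd.read_excel("Email_Sheet1.xlsx")
print(df.head())  # scraped addresses appear in the 'Email' column where found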
