Code that scrapes each URL in an Excel sheet, extracts email addresses from the page, and appends the scraped emails to the end of the corresponding row


영리치 · 2023.02.28 16:29
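The script below needs pandas, requests, and an Excel engine; a minimal setup, assuming a standard pip environment (beautifulsoup4 is only needed for the mailto: variant sketched at the end):

pip install pandas requests openpyxl beautifulsoup4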


import pandas as pd
import requests
import re
from urllib.parse import urlparse

# read every sheet of the workbook into a dict of DataFrames
dfs = pd.read_excel('voucher.xlsx', sheet_name=None)
df_new = pd.DataFrame()

# loop through each row of Sheet1
for index, row in dfs['Sheet1'].iterrows():
    url = row.iloc[5]  # website URL sits in column index 5 (the 6th column)
    try:
        urlparse(url)  # rough sanity check; raises ValueError on malformed URLs

        # send request to the website (assumes the sheet stores bare domains without a scheme)
        response = requests.get(f"http://{url}", timeout=5)
        html_content = response.text
        if response.status_code != 200:
            print(f"Failed to retrieve {url}: {response.status_code}")

        # regex pattern to match email addresses
        email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        emails = re.findall(email_regex, html_content)
        print(f"#### {emails}")

        # append the scraped addresses to the row and also log them to a text file
        if len(emails) > 0:
            row['Email'] = ';'.join(emails)
            print(f"$$$$$${row}")
            with open('Email_sheet1.txt', 'a') as f:
                for email in emails:
                    f.write(email + '\n')
    except requests.exceptions.Timeout:
        print(f"TIME OUT: {url}")
    except requests.exceptions.RequestException as e:
        print(f"Error retrieving {url}: {e}")
    except ValueError:
        print(f"Invalid URL: {url}")
    # DataFrame.append was removed in pandas 2.0; concat the row instead
    df_new = pd.concat([df_new, row.to_frame().T], ignore_index=True)

df_new.to_excel("Email_Sheet1.xlsx", index=False)
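
An alternative to regex matching over the raw HTML is to collect addresses only from mailto: links, which tends to produce fewer false positives. A minimal sketch with BeautifulSoup (the helper name extract_mailto_emails is illustrative, not part of the script above):

from bs4 import BeautifulSoup

def extract_mailto_emails(html):
    # parse the page and pull addresses out of mailto: anchors
    soup = BeautifulSoup(html, 'html.parser')
    emails = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.lower().startswith('mailto:'):
            # drop any ?subject=... query part after the address
            emails.append(href[len('mailto:'):].split('?')[0])
    return emails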


This creates a new Excel file (Email_Sheet1.xlsx) with the scraped emails appended to each row.
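
To spot-check the result, the new workbook can be read back with pandas (rows where no address was found will simply have no value in the trailing Email column):

import pandas as pd

df = pd.read_excel("Email_Sheet1.xlsx")
print(df.head())  # scraped addresses appear in the 'Email' column where found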
