티스토리 워드프레스 블로그 이전 파이썬 스크립트 오류 수정판
2025.05.29 – [컴퓨터 인터넷 모바일 it/블로그 애드센스 등] – 티스토리 블로그 워드프레스 이전 파이썬 스크립트
다음과 같은 오류가 발생했습니다.
python : Traceback (most recent call last):
위치 줄:1 문자:1
+ python tistory_to_wp.py --sitemap sitemap.xml --output wordpress_expo ...
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ CategoryInfo : NotSpecified: (Traceback (most recent call last)::String) [], RemoteException
+ FullyQualifiedErrorId : NativeCommandError
File "D:\WorkPY\BlogExport\tistory_to_wp.py", line 209, in <module>
main()
~~~~^^
File "D:\WorkPY\BlogExport\tistory_to_wp.py", line 206, in main
build_wxr(chunk, out_file, base_url, start_id=i)
~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\WorkPY\BlogExport\tistory_to_wp.py", line 135, in build_wxr
ce.text = etree.CDATA(minified)
~~~~~~~~~~~^^^^^^^^^^
File "src\\lxml\\etree.pyx", line 3170, in lxml.etree.CDATA.__cinit__
File "src\\lxml\\apihelpers.pxi", line 1530, in lxml.etree._utf8
ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters
오류 발생: HTTPSConnectionPool(host='tistory.hanuhyunu.pw', port=443): Max retries exceeded with url: /entry/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD-%EC%9E%AC%EB%B2%8C-%EA%B7
%B8%EB%A3%B9-%ED%9A%8C%EC%9E%A5%EB%93%A4%EC%9D%98-%EC%A0%84%EC%9A%A9%EA%B8%B0-%EA%B8%B0%EC%A2%85%EA%B3%BC-%EB%A0%88%EC%A7%80%EB%84%98%EB%B2%84 (Caused by NameResolutio
nError("<urllib3.connection.HTTPSConnection object at 0x00000146B836B390>: Failed to resolve 'tistory.hanuhyunu.pw' ([Errno 11002] getaddrinfo failed)"))
크롤링: https://tistory.hanuhyunu.pw/entry/%EC%98%9B%EB%82%A0%EC%A3%BC%EC%86%8C%EB%A1%9C-%EC%83%88%EC%A3%BC%EC%86%8C-%EC%B0%BE%EA%B8%B0-%EB%8F%84%EB%A1%9C%EB%AA%85%EC%A3%
BC%EC%86%8C%EC%B0%BE%EA%B8%B0-%EA%B5%AC%EC%A3%BC%EC%86%8C%EC%B0%BE%EA%B8%B0-%EC%83%88%EC%A3%BC%EC%86%8C%EC%B0%BE%EA%B8%B0
오류 발생: HTTPSConnectionPool(host='tistory.hanuhyunu.pw', port=443): Max retries exceeded with url: /entry/%EC%98%9B%EB%82%A0%EC%A3%BC%EC%86%8C%EB%A1%9C-%EC%83%88%EC%A3%
BC%EC%86%8C-%EC%B0%BE%EA%B8%B0-%EB%8F%84%EB%A1%9C%EB%AA%85%EC%A3%BC%EC%86%8C%EC%B0%BE%EA%B8%B0-%EA%B5%AC%EC%A3%BC%EC%86%8C%EC%B0%BE%EA%B8%B0-%EC%83%88%EC%A3%BC%EC%86%8
C%EC%B0%BE%EA%B8%B0 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000146B8368A50>: Failed to resolve 'tistory.hanuhyunu.pw' ([Errno
11002] getaddrinfo failed)"))
크롤링: https://tistory.hanuhyunu.pw/entry/%EC%BD%94%EB%B2%A0%EC%95%84-%ED%83%80%ED%94%84%EB%B9%85%EB%8F%943-%EC%98%A4%ED%86%A0%ED%85%90%ED%8A%B8-%ED%85%8C%EC%9D%B4%EB%B8
%94-%EC%BA%A0%ED%95%91%EC%9D%98%EC%9E%90-%EC%84%B8%ED%8A%B8
오류 발생: HTTPSConnectionPool(host='tistory.hanuhyunu.pw', port=443): Max retries exceeded with url: /entry/%EC%BD%94%EB%B2%A0%EC%95%84-%ED%83%80%ED%94%84%EB%B9%85%EB%8F%
943-%EC%98%A4%ED%86%A0%ED%85%90%ED%8A%B8-%ED%85%8C%EC%9D%B4%EB%B8%94-%EC%BA%A0%ED%95%91%EC%9D%98%EC%9E%90-%EC%84%B8%ED%8A%B8 (Caused by NameResolutionError("<urllib3.c
onnection.HTTPSConnection object at 0x00000146B836A490>: Failed to resolve 'tistory.hanuhyunu.pw' ([Errno 11002] getaddrinfo failed)"))
아래처럼 build_wxr()
안에서 CDATA로 감싸기 전에 제어문자(널 바이트 등 XML에 유효하지 않은 문자)를 제거하도록 고치면, 더 이상 ValueError
로 멈추지 않고 끝까지 잘 돌아갑니다.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timezone
from lxml import etree
import re
import urllib.parse
from xml.etree import ElementTree as ET
def parse_sitemap(path):
    """Parse a sitemap XML file and return every post URL containing '/entry/'.

    Args:
        path: Path to the sitemap XML file.

    Returns:
        List of entry URL strings, in document order.
    """
    tree = ET.parse(path)
    root = tree.getroot()
    ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    # Guard against empty <loc/> elements: el.text is None there and the
    # original "'/entry/' in el.text" would raise TypeError.
    return [el.text for el in root.findall('.//ns:loc', ns)
            if el.text and '/entry/' in el.text]
def clean_entry_content(html):
    """Strip Tistory chrome (ads, scripts, TOC, buttons) from post HTML.

    Args:
        html: Raw inner HTML of a post's content container.

    Returns:
        Cleaned HTML string with ad/script/navigation elements removed.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Tistory's auto-generated table of contents.
    for toc in soup.select('div.book-toc'):
        toc.decompose()
    # Executable/embedded content has no place in a WXR import.
    for tag in soup.find_all(['script', 'ins', 'iframe']):
        tag.decompose()
    pattern = re.compile(r'ca-pub-\d+')  # AdSense publisher id marker
    for tag in soup.find_all(True):
        try:
            if tag.string and pattern.search(tag.string):
                tag.decompose()
                continue
            attrs = tag.attrs or {}
            if any(pattern.search(str(v)) for v in attrs.values()):
                tag.decompose()
                continue
            style = attrs.get('style', '')
            if any(k in style for k in ['adsbygoogle', 'overflow:hidden']):
                tag.decompose()
                continue
            if attrs.get('data-tistory-react-app') in ['NaverAd', 'Reaction']:
                tag.decompose()
                continue
            if attrs.get('id') == 'adsense_script':
                tag.decompose()
                continue
        except Exception:
            # Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
            # are not swallowed. A tag already decomposed earlier in the walk
            # can raise when re-inspected; best-effort: skip it.
            continue
    for btn in soup.find_all('button'):
        btn.decompose()
    return soup.decode_contents()
def fetch_post(url):
    """Download one Tistory post and extract its metadata and cleaned body.

    Args:
        url: Full URL of the post to crawl.

    Returns:
        Tuple of (title, pubDate, pub, cleaned, url, guid_url) where
        pubDate is an RFC-822 string and pub a datetime.

    Raises:
        requests.RequestException: on network failure or non-2xx status.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    title = soup.find('title').get_text().strip() if soup.find('title') else url
    date_str = ''
    if span := soup.find('span', class_='date'):
        date_str = span.get_text().strip()
    elif time_tag := soup.find('time'):
        date_str = time_tag.get('datetime', '')
    try:
        # Tistory renders dates like "2024. 1. 2." — otherwise try ISO 8601.
        m = re.match(r"(\d{4})\.\s*(\d{1,2})\.\s*(\d{1,2})\.", date_str)
        pub = datetime(*map(int, m.groups())) if m else datetime.fromisoformat(date_str)
    except (ValueError, TypeError):
        # Was a bare `except:`. Unparseable/missing date: fall back to "now"
        # so the export still completes.
        pub = datetime.now()
    rss_pub = pub.astimezone(timezone.utc) if pub.tzinfo else pub
    pubDate = rss_pub.strftime('%a, %d %b %Y %H:%M:%S +0000')
    cont = (
        soup.find('div', class_='entry-content')
        or soup.find('article') or soup.find('section')
    )
    raw_html = cont.decode_contents() if cont else ''
    cleaned = clean_entry_content(raw_html)
    # Prefer the canonical permalink from the dg:plink meta tag when present.
    guid_url = (soup.find("meta", property="dg:plink") or {}).get("content", url)
    return title, pubDate, pub, cleaned, url, guid_url
def sanitize_xml(text: str) -> str:
    """Remove characters that are not legal in XML 1.0 content.

    lxml's etree.CDATA() raises ValueError on such characters ("All strings
    must be XML compatible"), which aborted the whole export. Strips the C0
    control characters (except tab, LF, CR, which XML allows), lone
    surrogates, and the noncharacters U+FFFE/U+FFFF.

    Args:
        text: Arbitrary string scraped from a post.

    Returns:
        The string with all XML-invalid characters removed.
    """
    return re.sub(
        r'[\x00-\x08\x0B\x0C\x0E-\x1F\uD800-\uDFFF\uFFFE\uFFFF]', '', text)
def build_wxr(posts, out_path, base_url, start_id=0):
    """Write one WordPress WXR (RSS 2.0) export file for a chunk of posts.

    Args:
        posts: Iterable of (title, pubDate, pub, content, link, guid_url)
            tuples as produced by fetch_post().
        out_path: Destination path of the generated XML file.
        base_url: Blog root URL used in the channel header.
        start_id: Offset for wp:post_id numbering across chunked exports.
    """
    print(f"[빌드 시작] {out_path} (포스트 {start_id+1}~{start_id+len(posts)})")
    NSMAP = {
        'excerpt': "http://wordpress.org/export/1.2/excerpt/",
        'content': "http://purl.org/rss/1.0/modules/content/",
        'dc': "http://purl.org/dc/elements/1.1/",
        'wp': "http://wordpress.org/export/1.2/"
    }
    wp_ns = NSMAP['wp']
    root = etree.Element('rss', nsmap=NSMAP)
    root.set('version', '2.0')
    channel = etree.SubElement(root, 'channel')

    def add(parent, tag, text):
        # Append a child element, CDATA-wrapping where the WXR format
        # expects it. Scraped text is sanitized first so etree.CDATA()
        # can never raise ValueError on control characters.
        el = etree.SubElement(parent, tag)
        if tag == 'title':
            el.text = etree.CDATA(sanitize_xml(text))
        elif tag == '{http://purl.org/dc/elements/1.1/}creator':
            el.text = etree.CDATA(sanitize_xml(text))
        elif tag.startswith(f'{{{wp_ns}}}'):
            # Numeric wp:* values stay plain text; everything else is CDATA.
            # (Replaces the fragile `cond and a or b` idiom.)
            el.text = text if text.isdigit() else etree.CDATA(sanitize_xml(text))
        else:
            el.text = text
        return el

    # Channel header
    add(channel, 'title', 'Tistory 백업')
    add(channel, 'link', base_url)
    add(channel, 'description', '')
    add(channel, 'language', 'ko-KR')
    add(channel, '{http://wordpress.org/export/1.2/}wxr_version', '1.2')
    add(channel, '{http://wordpress.org/export/1.2/}base_site_url', base_url)
    add(channel, '{http://wordpress.org/export/1.2/}base_blog_url', base_url)

    # Author (WordPress maps posts onto this account at import time)
    auth = etree.SubElement(channel, '{http://wordpress.org/export/1.2/}author')
    add(auth, '{http://wordpress.org/export/1.2/}author_id', '1')
    add(auth, '{http://wordpress.org/export/1.2/}author_login', 'admin')
    add(auth, '{http://wordpress.org/export/1.2/}author_email', 'admin@blog.com')
    add(auth, '{http://wordpress.org/export/1.2/}author_display_name', 'admin')

    # Posts
    for idx, (title, pubDate, pub, content, link, guid_url) in enumerate(posts, start=start_id+1):
        print(f" → 아이템 #{idx}")
        item = etree.SubElement(channel, 'item')
        add(item, 'title', title)
        add(item, 'link', link)
        add(item, 'pubDate', pubDate)
        add(item, '{http://purl.org/dc/elements/1.1/}creator', 'admin')
        guid_el = etree.SubElement(item, 'guid', isPermaLink="false")
        guid_el.text = guid_url
        etree.SubElement(item, 'description')
        # content:encoded — minify, then strip XML-invalid characters so the
        # CDATA constructor cannot abort the build (the original crash).
        ce = etree.SubElement(item, '{http://purl.org/rss/1.0/modules/content/}encoded')
        minified = re.sub(r'>\s+<', '><',
                          content.replace('\n', '').replace('\r', '').replace('\t', ''))
        safe = sanitize_xml(minified)
        ce.text = etree.CDATA(safe)
        # wp:post_date / post_date_gmt
        dt_wp = pub.strftime('%Y-%m-%d %H:%M:%S')
        dt_gmt = pub.astimezone(timezone.utc).strftime('%Y-%m-%d %H:%M:%S') if pub.tzinfo else dt_wp
        add(item, '{http://wordpress.org/export/1.2/}post_id', str(idx))
        add(item, '{http://wordpress.org/export/1.2/}post_date', dt_wp)
        add(item, '{http://wordpress.org/export/1.2/}post_date_gmt', dt_gmt)
        # Remaining wp:* metadata
        add(item, '{http://wordpress.org/export/1.2/}comment_status', 'open')
        add(item, '{http://wordpress.org/export/1.2/}ping_status', 'open')
        slug = link.rstrip('/').split('/')[-1]
        add(item, '{http://wordpress.org/export/1.2/}post_name', slug)
        add(item, '{http://wordpress.org/export/1.2/}status', 'publish')
        add(item, '{http://wordpress.org/export/1.2/}post_parent', '0')
        add(item, '{http://wordpress.org/export/1.2/}menu_order', '0')
        add(item, '{http://wordpress.org/export/1.2/}post_type', 'post')
        add(item, '{http://wordpress.org/export/1.2/}post_password', '')
        add(item, '{http://wordpress.org/export/1.2/}is_sticky', '0')
        cat = etree.Element('category', domain="category", nicename="uncategorized")
        cat.text = "Uncategorized"
        item.append(cat)
        meta = etree.SubElement(item, '{http://wordpress.org/export/1.2/}postmeta')
        add(meta, '{http://wordpress.org/export/1.2/}meta_key', '_edit_last')
        add(meta, '{http://wordpress.org/export/1.2/}meta_value', '1')

    tree = etree.ElementTree(root)
    tree.write(out_path, encoding='utf-8', pretty_print=True, xml_declaration=True)
    remove_xml_comments(out_path)
    print(f"[완료] {out_path}\n")
def remove_xml_comments(path):
    """Strip known Tistory template comments from an exported XML file.

    Rewrites the file in place. The removed markers are layout comments
    that Tistory embeds in post HTML and that serve no purpose in a
    WordPress import.

    Args:
        path: Path of the XML file to scrub.
    """
    patterns = [
        r'<!--\s*System\s*-\s*START\s*-->',
        r'<!--\s*System\s*-\s*END\s*-->',
        r'<!--\s*inventory\s*-->',
        r'<!--\s*PostListinCategory\s*-\s*START\s*-->',
        r'<!--\s*PostListinCategory\s*-\s*END\s*-->',
    ]
    # `with` closes the handle deterministically (the original leaked it),
    # and the module-level `re` import makes the local one redundant.
    with open(path, 'r', encoding='utf-8') as f:
        txt = f.read()
    for pat in patterns:
        txt = re.sub(pat, '', txt, flags=re.IGNORECASE)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(txt)
def main():
    """CLI entry point: crawl every sitemap entry and emit chunked WXR files.

    Reads post URLs from --sitemap, crawls each one (failures are logged
    and skipped), and writes the results in chunks of 100 posts per file
    named `<output-stem>-001.<ext>`, `-002`, etc.
    """
    parser = argparse.ArgumentParser(description='Tistory → WordPress 변환기')
    parser.add_argument('--sitemap', default='sitemap.xml', help='Sitemap XML 경로')
    parser.add_argument('--output', default='wordpress_export_final.xml', help='출력 WXR 파일명')
    args = parser.parse_args()
    urls = parse_sitemap(args.sitemap)
    # Empty sitemap: the original crashed with IndexError on urls[0].
    if not urls:
        print("사이트맵에서 '/entry/' URL을 찾지 못했습니다.")
        return
    base = urllib.parse.urlparse(urls[0])
    base_url = f"{base.scheme}://{base.netloc}"
    posts = []
    total = len(urls)
    for i, url in enumerate(urls, start=1):
        print(f"[{i}/{total}] 크롤링: {url}")
        try:
            posts.append(fetch_post(url))
        except Exception as e:
            # Network/parse failures skip the post; the build never aborts.
            print(f" ⚠️ 실패: {e}")
    # rsplit('.', 1) raised ValueError when --output had no extension.
    if '.' in args.output:
        prefix, ext = args.output.rsplit('.', 1)
    else:
        prefix, ext = args.output, 'xml'
    # Save in chunks of 100 posts per WXR file.
    for chunk_idx in range(0, len(posts), 100):
        chunk = posts[chunk_idx:chunk_idx+100]
        part = chunk_idx//100 + 1
        out_file = f"{prefix}-{part:03d}.{ext}"
        build_wxr(chunk, out_file, base_url, start_id=chunk_idx)

if __name__ == '__main__':
    main()
주요 변경 사항
- sanitize_xml() 함수로 널 바이트 등 XML에 유효하지 않은 제어문자를 제거합니다.
- CDATA로 감싸기 전에 safe = sanitize_xml(minified)를 적용합니다.
- 네트워크/크롤링 오류는 그대로 건너뛰므로, 빌드 단계에서 절대 멈추지 않습니다.