티스토리 워드프레스 블로그 이전 파이썬 스크립트 오류 수정판
2025.05.29 – [컴퓨터 인터넷 모바일 it/블로그 애드센스 등] – 티스토리 블로그 워드프레스 이전 파이썬 스크립트
다음과 같은 오류가 발생했습니다.
python : Traceback (most recent call last):
위치 줄:1 문자:1
+ python tistory_to_wp.py --sitemap sitemap.xml --output wordpress_expo ...
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ CategoryInfo : NotSpecified: (Traceback (most recent call last)::String) [], RemoteException
+ FullyQualifiedErrorId : NativeCommandError
File "D:\WorkPY\BlogExport\tistory_to_wp.py", line 209, in <module>
main()
~~~~^^
File "D:\WorkPY\BlogExport\tistory_to_wp.py", line 206, in main
build_wxr(chunk, out_file, base_url, start_id=i)
~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\WorkPY\BlogExport\tistory_to_wp.py", line 135, in build_wxr
ce.text = etree.CDATA(minified)
~~~~~~~~~~~^^^^^^^^^^
File "src\\lxml\\etree.pyx", line 3170, in lxml.etree.CDATA.__cinit__
File "src\\lxml\\apihelpers.pxi", line 1530, in lxml.etree._utf8
ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters
오류 발생: HTTPSConnectionPool(host='tistory.hanuhyunu.pw', port=443): Max retries exceeded with url: /entry/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD-%EC%9E%AC%EB%B2%8C-%EA%B7
%B8%EB%A3%B9-%ED%9A%8C%EC%9E%A5%EB%93%A4%EC%9D%98-%EC%A0%84%EC%9A%A9%EA%B8%B0-%EA%B8%B0%EC%A2%85%EA%B3%BC-%EB%A0%88%EC%A7%80%EB%84%98%EB%B2%84 (Caused by NameResolutio
nError("<urllib3.connection.HTTPSConnection object at 0x00000146B836B390>: Failed to resolve 'tistory.hanuhyunu.pw' ([Errno 11002] getaddrinfo failed)"))
크롤링: https://tistory.hanuhyunu.pw/entry/%EC%98%9B%EB%82%A0%EC%A3%BC%EC%86%8C%EB%A1%9C-%EC%83%88%EC%A3%BC%EC%86%8C-%EC%B0%BE%EA%B8%B0-%EB%8F%84%EB%A1%9C%EB%AA%85%EC%A3%
BC%EC%86%8C%EC%B0%BE%EA%B8%B0-%EA%B5%AC%EC%A3%BC%EC%86%8C%EC%B0%BE%EA%B8%B0-%EC%83%88%EC%A3%BC%EC%86%8C%EC%B0%BE%EA%B8%B0
오류 발생: HTTPSConnectionPool(host='tistory.hanuhyunu.pw', port=443): Max retries exceeded with url: /entry/%EC%98%9B%EB%82%A0%EC%A3%BC%EC%86%8C%EB%A1%9C-%EC%83%88%EC%A3%
BC%EC%86%8C-%EC%B0%BE%EA%B8%B0-%EB%8F%84%EB%A1%9C%EB%AA%85%EC%A3%BC%EC%86%8C%EC%B0%BE%EA%B8%B0-%EA%B5%AC%EC%A3%BC%EC%86%8C%EC%B0%BE%EA%B8%B0-%EC%83%88%EC%A3%BC%EC%86%8
C%EC%B0%BE%EA%B8%B0 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000146B8368A50>: Failed to resolve 'tistory.hanuhyunu.pw' ([Errno
11002] getaddrinfo failed)"))
크롤링: https://tistory.hanuhyunu.pw/entry/%EC%BD%94%EB%B2%A0%EC%95%84-%ED%83%80%ED%94%84%EB%B9%85%EB%8F%943-%EC%98%A4%ED%86%A0%ED%85%90%ED%8A%B8-%ED%85%8C%EC%9D%B4%EB%B8
%94-%EC%BA%A0%ED%95%91%EC%9D%98%EC%9E%90-%EC%84%B8%ED%8A%B8
오류 발생: HTTPSConnectionPool(host='tistory.hanuhyunu.pw', port=443): Max retries exceeded with url: /entry/%EC%BD%94%EB%B2%A0%EC%95%84-%ED%83%80%ED%94%84%EB%B9%85%EB%8F%
943-%EC%98%A4%ED%86%A0%ED%85%90%ED%8A%B8-%ED%85%8C%EC%9D%B4%EB%B8%94-%EC%BA%A0%ED%95%91%EC%9D%98%EC%9E%90-%EC%84%B8%ED%8A%B8 (Caused by NameResolutionError("<urllib3.c
onnection.HTTPSConnection object at 0x00000146B836A490>: Failed to resolve 'tistory.hanuhyunu.pw' ([Errno 11002] getaddrinfo failed)"))
아래처럼 build_wxr()
안에서 CDATA로 감싸기 전에 제어문자(널 바이트 등 XML에 유효하지 않은 문자)를 제거하도록 고치면, 더 이상 ValueError
로 멈추지 않고 끝까지 잘 돌아갑니다.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timezone
from lxml import etree
import re
import urllib.parse
from xml.etree import ElementTree as ET
def parse_sitemap(path):
    """Parse a sitemap XML file and return every post URL containing '/entry/'.

    Args:
        path: Path to the sitemap XML file.

    Returns:
        List of entry URL strings, in document order.
    """
    tree = ET.parse(path)
    root = tree.getroot()
    ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    # Guard against empty <loc/> elements: el.text is None there and the
    # original "'/entry/' in el.text" would raise TypeError.
    return [el.text for el in root.findall('.//ns:loc', ns)
            if el.text and '/entry/' in el.text]
def clean_entry_content(html):
    """Strip Tistory chrome (ads, scripts, TOC, buttons) from post HTML.

    Args:
        html: Raw inner HTML of a post's content container.

    Returns:
        Cleaned HTML string with ad/script/navigation elements removed.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Tistory's auto-generated table of contents.
    for toc in soup.select('div.book-toc'):
        toc.decompose()
    # Executable/embedded content has no place in a WXR import.
    for tag in soup.find_all(['script', 'ins', 'iframe']):
        tag.decompose()
    pattern = re.compile(r'ca-pub-\d+')  # AdSense publisher id marker
    for tag in soup.find_all(True):
        try:
            if tag.string and pattern.search(tag.string):
                tag.decompose()
                continue
            attrs = tag.attrs or {}
            if any(pattern.search(str(v)) for v in attrs.values()):
                tag.decompose()
                continue
            style = attrs.get('style', '')
            if any(k in style for k in ['adsbygoogle', 'overflow:hidden']):
                tag.decompose()
                continue
            if attrs.get('data-tistory-react-app') in ['NaverAd', 'Reaction']:
                tag.decompose()
                continue
            if attrs.get('id') == 'adsense_script':
                tag.decompose()
                continue
        except Exception:
            # Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
            # are not swallowed. A tag already decomposed earlier in the walk
            # can raise when re-inspected; best-effort: skip it.
            continue
    for btn in soup.find_all('button'):
        btn.decompose()
    return soup.decode_contents()
def fetch_post(url):
    """Download one Tistory post and extract its metadata and cleaned body.

    Args:
        url: Full URL of the post to crawl.

    Returns:
        Tuple of (title, pubDate, pub, cleaned, url, guid_url) where
        pubDate is an RFC-822 string and pub a datetime.

    Raises:
        requests.RequestException: on network failure or non-2xx status.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    title = soup.find('title').get_text().strip() if soup.find('title') else url
    date_str = ''
    if span := soup.find('span', class_='date'):
        date_str = span.get_text().strip()
    elif time_tag := soup.find('time'):
        date_str = time_tag.get('datetime', '')
    try:
        # Tistory renders dates like "2024. 1. 2." — otherwise try ISO 8601.
        m = re.match(r"(\d{4})\.\s*(\d{1,2})\.\s*(\d{1,2})\.", date_str)
        pub = datetime(*map(int, m.groups())) if m else datetime.fromisoformat(date_str)
    except (ValueError, TypeError):
        # Was a bare `except:`. Unparseable/missing date: fall back to "now"
        # so the export still completes.
        pub = datetime.now()
    rss_pub = pub.astimezone(timezone.utc) if pub.tzinfo else pub
    pubDate = rss_pub.strftime('%a, %d %b %Y %H:%M:%S +0000')
    cont = (
        soup.find('div', class_='entry-content')
        or soup.find('article') or soup.find('section')
    )
    raw_html = cont.decode_contents() if cont else ''
    cleaned = clean_entry_content(raw_html)
    # Prefer the canonical permalink from the dg:plink meta tag when present.
    guid_url = (soup.find("meta", property="dg:plink") or {}).get("content", url)
    return title, pubDate, pub, cleaned, url, guid_url
def sanitize_xml(text: str) -> str:
    """Remove characters that are not legal in XML 1.0 content.

    lxml's etree.CDATA() raises ValueError on such characters ("All strings
    must be XML compatible"), which aborted the whole export. Strips the C0
    control characters (except tab, LF, CR, which XML allows), lone
    surrogates, and the noncharacters U+FFFE/U+FFFF.

    Args:
        text: Arbitrary string scraped from a post.

    Returns:
        The string with all XML-invalid characters removed.
    """
    return re.sub(
        r'[\x00-\x08\x0B\x0C\x0E-\x1F\uD800-\uDFFF\uFFFE\uFFFF]', '', text)
def build_wxr(posts, out_path, base_url, start_id=0):
    """Write one WordPress WXR (RSS 2.0) export file for a chunk of posts.

    Args:
        posts: Iterable of (title, pubDate, pub, content, link, guid_url)
            tuples as produced by fetch_post().
        out_path: Destination path of the generated XML file.
        base_url: Blog root URL used in the channel header.
        start_id: Offset for wp:post_id numbering across chunked exports.
    """
    print(f"[빌드 시작] {out_path} (포스트 {start_id+1}~{start_id+len(posts)})")
    NSMAP = {
        'excerpt': "http://wordpress.org/export/1.2/excerpt/",
        'content': "http://purl.org/rss/1.0/modules/content/",
        'dc': "http://purl.org/dc/elements/1.1/",
        'wp': "http://wordpress.org/export/1.2/"
    }
    wp_ns = NSMAP['wp']
    root = etree.Element('rss', nsmap=NSMAP)
    root.set('version', '2.0')
    channel = etree.SubElement(root, 'channel')

    def add(parent, tag, text):
        # Append a child element, CDATA-wrapping where the WXR format
        # expects it. Scraped text is sanitized first so etree.CDATA()
        # can never raise ValueError on control characters.
        el = etree.SubElement(parent, tag)
        if tag == 'title':
            el.text = etree.CDATA(sanitize_xml(text))
        elif tag == '{http://purl.org/dc/elements/1.1/}creator':
            el.text = etree.CDATA(sanitize_xml(text))
        elif tag.startswith(f'{{{wp_ns}}}'):
            # Numeric wp:* values stay plain text; everything else is CDATA.
            # (Replaces the fragile `cond and a or b` idiom.)
            el.text = text if text.isdigit() else etree.CDATA(sanitize_xml(text))
        else:
            el.text = text
        return el

    # Channel header
    add(channel, 'title', 'Tistory 백업')
    add(channel, 'link', base_url)
    add(channel, 'description', '')
    add(channel, 'language', 'ko-KR')
    add(channel, '{http://wordpress.org/export/1.2/}wxr_version', '1.2')
    add(channel, '{http://wordpress.org/export/1.2/}base_site_url', base_url)
    add(channel, '{http://wordpress.org/export/1.2/}base_blog_url', base_url)

    # Author (WordPress maps posts onto this account at import time)
    auth = etree.SubElement(channel, '{http://wordpress.org/export/1.2/}author')
    add(auth, '{http://wordpress.org/export/1.2/}author_id', '1')
    add(auth, '{http://wordpress.org/export/1.2/}author_login', 'admin')
    add(auth, '{http://wordpress.org/export/1.2/}author_email', 'admin@blog.com')
    add(auth, '{http://wordpress.org/export/1.2/}author_display_name', 'admin')

    # Posts
    for idx, (title, pubDate, pub, content, link, guid_url) in enumerate(posts, start=start_id+1):
        print(f" → 아이템 #{idx}")
        item = etree.SubElement(channel, 'item')
        add(item, 'title', title)
        add(item, 'link', link)
        add(item, 'pubDate', pubDate)
        add(item, '{http://purl.org/dc/elements/1.1/}creator', 'admin')
        guid_el = etree.SubElement(item, 'guid', isPermaLink="false")
        guid_el.text = guid_url
        etree.SubElement(item, 'description')
        # content:encoded — minify, then strip XML-invalid characters so the
        # CDATA constructor cannot abort the build (the original crash).
        ce = etree.SubElement(item, '{http://purl.org/rss/1.0/modules/content/}encoded')
        minified = re.sub(r'>\s+<', '><',
                          content.replace('\n', '').replace('\r', '').replace('\t', ''))
        safe = sanitize_xml(minified)
        ce.text = etree.CDATA(safe)
        # wp:post_date / post_date_gmt
        dt_wp = pub.strftime('%Y-%m-%d %H:%M:%S')
        dt_gmt = pub.astimezone(timezone.utc).strftime('%Y-%m-%d %H:%M:%S') if pub.tzinfo else dt_wp
        add(item, '{http://wordpress.org/export/1.2/}post_id', str(idx))
        add(item, '{http://wordpress.org/export/1.2/}post_date', dt_wp)
        add(item, '{http://wordpress.org/export/1.2/}post_date_gmt', dt_gmt)
        # Remaining wp:* metadata
        add(item, '{http://wordpress.org/export/1.2/}comment_status', 'open')
        add(item, '{http://wordpress.org/export/1.2/}ping_status', 'open')
        slug = link.rstrip('/').split('/')[-1]
        add(item, '{http://wordpress.org/export/1.2/}post_name', slug)
        add(item, '{http://wordpress.org/export/1.2/}status', 'publish')
        add(item, '{http://wordpress.org/export/1.2/}post_parent', '0')
        add(item, '{http://wordpress.org/export/1.2/}menu_order', '0')
        add(item, '{http://wordpress.org/export/1.2/}post_type', 'post')
        add(item, '{http://wordpress.org/export/1.2/}post_password', '')
        add(item, '{http://wordpress.org/export/1.2/}is_sticky', '0')
        cat = etree.Element('category', domain="category", nicename="uncategorized")
        cat.text = "Uncategorized"
        item.append(cat)
        meta = etree.SubElement(item, '{http://wordpress.org/export/1.2/}postmeta')
        add(meta, '{http://wordpress.org/export/1.2/}meta_key', '_edit_last')
        add(meta, '{http://wordpress.org/export/1.2/}meta_value', '1')

    tree = etree.ElementTree(root)
    tree.write(out_path, encoding='utf-8', pretty_print=True, xml_declaration=True)
    remove_xml_comments(out_path)
    print(f"[완료] {out_path}\n")
def remove_xml_comments(path):
    """Strip known Tistory template comments from an exported XML file.

    Rewrites the file in place. The removed markers are layout comments
    that Tistory embeds in post HTML and that serve no purpose in a
    WordPress import.

    Args:
        path: Path of the XML file to scrub.
    """
    patterns = [
        r'<!--\s*System\s*-\s*START\s*-->',
        r'<!--\s*System\s*-\s*END\s*-->',
        r'<!--\s*inventory\s*-->',
        r'<!--\s*PostListinCategory\s*-\s*START\s*-->',
        r'<!--\s*PostListinCategory\s*-\s*END\s*-->',
    ]
    # `with` closes the handle deterministically (the original leaked it),
    # and the module-level `re` import makes the local one redundant.
    with open(path, 'r', encoding='utf-8') as f:
        txt = f.read()
    for pat in patterns:
        txt = re.sub(pat, '', txt, flags=re.IGNORECASE)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(txt)
def main():
    """CLI entry point: crawl every sitemap entry and emit chunked WXR files.

    Reads post URLs from --sitemap, crawls each one (failures are logged
    and skipped), and writes the results in chunks of 100 posts per file
    named `<output-stem>-001.<ext>`, `-002`, etc.
    """
    parser = argparse.ArgumentParser(description='Tistory → WordPress 변환기')
    parser.add_argument('--sitemap', default='sitemap.xml', help='Sitemap XML 경로')
    parser.add_argument('--output', default='wordpress_export_final.xml', help='출력 WXR 파일명')
    args = parser.parse_args()
    urls = parse_sitemap(args.sitemap)
    # Empty sitemap: the original crashed with IndexError on urls[0].
    if not urls:
        print("사이트맵에서 '/entry/' URL을 찾지 못했습니다.")
        return
    base = urllib.parse.urlparse(urls[0])
    base_url = f"{base.scheme}://{base.netloc}"
    posts = []
    total = len(urls)
    for i, url in enumerate(urls, start=1):
        print(f"[{i}/{total}] 크롤링: {url}")
        try:
            posts.append(fetch_post(url))
        except Exception as e:
            # Network/parse failures skip the post; the build never aborts.
            print(f" ⚠️ 실패: {e}")
    # rsplit('.', 1) raised ValueError when --output had no extension.
    if '.' in args.output:
        prefix, ext = args.output.rsplit('.', 1)
    else:
        prefix, ext = args.output, 'xml'
    # Save in chunks of 100 posts per WXR file.
    for chunk_idx in range(0, len(posts), 100):
        chunk = posts[chunk_idx:chunk_idx+100]
        part = chunk_idx//100 + 1
        out_file = f"{prefix}-{part:03d}.{ext}"
        build_wxr(chunk, out_file, base_url, start_id=chunk_idx)

if __name__ == '__main__':
    main()
주요 변경 사항
- sanitize_xml() 함수로 널 바이트 등 XML에 유효하지 않은 제어문자를 제거합니다.
- CDATA로 감싸기 전에 safe = sanitize_xml(minified)를 적용합니다.
- 네트워크/크롤링 오류는 그대로 건너뛰므로, 빌드 단계에서 절대 멈추지 않습니다.