MovableTypeからBloggerへ（その２）

先日、MovableTypeからBloggerに移行する記事を書いたが、その中でいくつか問題を発見したので、変換ツールのコードを変更した。
問題①
時刻がずれる。（9時間進んでしまう）
これは、元のスクリプトがローカル時刻を出力しているにもかかわらず、それをUTCであると宣言しているため。日本時間（UTC+9）を考慮するように変更した。
問題②
MovableTypeで記事を作成したときのフォーマットによっては、改行が無駄に入ってしまう。
網羅していないフォーマット種別はあるが、分かる範囲で改行追加の有無を判断するようにした。

--- google-blog-converters-r89/src/movabletype2blogger/mt2b.py	2010-05-28 12:21:06.000000000 +0900
+++ google-blog-converters-r89_new/src/movabletype2blogger/mt2b.py	2020-06-20 21:32:30.372455400 +0900
@@ -20,6 +20,7 @@
 import re
 import sys
 import time
+import datetime
 from xml.sax.saxutils import escape
 
 import gdata
@@ -27,6 +28,16 @@
 
 __author__ = 'JJ Lueck (jlueck@gmail.com)'
 
+timezone_gap = 9     # The original script labels timestamps as UTC although they are in fact local time,
+                     # and Blogger seems to convert imported timestamps to the user's
+                     # timezone. Thus, when importing from the Japanese timezone (UTC+9), entries
+                     # end up with the timestamp of the imported data + 9 hours.
+                     # E.g. MovableType timestamp:     05/21/2020 09:33:30 PM
+                     #      Output of original script: 2020-05-21T21:33:30Z (when run in Japan)
+                     #      Timestamp on Blogger:      2020/05/22 06:33     (when imported from Japan)
+                     # To avoid this, the timezone offset needs to be subtracted.
+                     # This value is the "+9" of "UTC+9" (Japan Standard Time).
+
 ########################
 # Constants
 ########################
@@ -114,15 +125,31 @@
     last_entry = None    # The previous post atom.Entry if exists
     tag_name = None      # The current name of multi-line values
     tag_contents = ''    # The contents of multi-line values
+    linenum = 0          # Number of line being processed
+    tag_separator = ''   # keep the line with '-' only because it might be 
+                         # treated as a tag separator ('-' * 8 or '-' * 5) wrongly
+    tag_name_back = tag_name  # In case of wrong treatment of tag separator, copy tag_name
+    #_EXTENDED_BODY_SEPARATOR_ = '<br /><a name=\'more\'></a>'  
+                         # Blogger uses "<!--more-->" as the separator, and it is converted to
+                         # the above when exported. But that form is imported as
+                         # "<a href="https://www.blogger.com/null" name="more"></a>" instead
+                         # of the original separator, "<!--more-->".
+    _EXTENDED_BODY_SEPARATOR_ = '<!--more-->'
+    extended_body_separator = _EXTENDED_BODY_SEPARATOR_ 
+                         # Also used as a flag indicating separator of extended body from 
+                         # main body is added into output or not. 
+    convert_breaks = '<br/>' # flag to mark if <br/> need be added or not
 
     # Loop through the text lines looking for key/value pairs
     for line in infile:
+      linenum+=1
 
       # Remove whitespace
       line = line.strip().lstrip(codecs.BOM_UTF8)
 
       # Check for the post ending token
       if line == '-' * 8:
+        tag_separator = line
         if post_entry:
           # If the body tag is still being read, add what has been read.
           if tag_name == 'BODY':
@@ -137,11 +164,12 @@
         post_entry = None
         comment_entry = None
         tag_name = None
-        tag_contents = ''
+        tag_contents = '\n'
         continue
 
       # Check for the tag ending separator
       elif line == '-' * 5:
+        tag_separator = line
         # Get the contents of the body and set the entry contents
         if tag_name == 'BODY':
           post_entry.content = atom.Content(
@@ -162,9 +190,10 @@
         # entry contents
         elif tag_name == 'EXTENDED BODY':
           if post_entry:
-            post_entry.content.text += '<br/>' + self._TranslateContents(tag_contents)
+            post_entry.content.text += extended_body_separator + self._TranslateContents(tag_contents)
           elif last_entry and last_entry.content:
-            last_entry.content.text += '<br/>' + self._TranslateContents(tag_contents)
+            last_entry.content.text += extended_body_separator + self._TranslateContents(tag_contents)
+          extended_body_separator = convert_breaks
 
         # Convert any keywords (comma separated values) into Blogger labels
         elif tag_name == 'KEYWORDS':
@@ -175,6 +204,7 @@
                   atom.Category(scheme=CATEGORY_NS, term=keyword))
 
         # Reset the current tag and its contents
+        tag_name_back = tag_name
         tag_name = None
         tag_contents = ''
         continue
@@ -261,20 +291,59 @@
       # on following lines
       elif key in ('COMMENT', 'BODY', 'EXTENDED BODY', 'EXCERPT', 'KEYWORDS', 'PING'):
         tag_name = key
+        tag_separator = ''
+        extended_body_separator = _EXTENDED_BODY_SEPARATOR_ 
 
       # These lines can be safely ignored
-      elif key in ('BASENAME', 'ALLOW COMMENTS', 'CONVERT BREAKS',
+      elif key in ('BASENAME', 'ALLOW COMMENTS', 'CONVERT BREAKS', 
                    'ALLOW PINGS', 'PRIMARY CATEGORY', 'IP', 'URL', 'EMAIL'):
+        tag_separator = ''
+        extended_body_separator = _EXTENDED_BODY_SEPARATOR_ 
+
+        if key in ('CONVERT BREAKS'):
+          if value in ( '__default__', 'markdown', 'markdown_with_smartypants','textile_2'):
+            convert_breaks = '<br/>'
+          elif ( value == '0'):
+            convert_breaks = ''
+          else:
+            convert_breaks = '<br/>'
+            sys.stderr.write('Warn: ' + str(linenum) + ': cannot be processed. Ignored. "' + line + '"\n')
         continue
 
-      # If the line is empty and we're processing the body, add an HTML line
-      # break
-      elif tag_name == 'BODY' and len(line) == 0:
-        tag_contents += '<br/>'
+      # If we're processing the body, extended body, comment, or keywords, add the line
+      # because it is part of that tag's content.
+      # Note: The same applies to the excerpt, but Blogger has no excerpt field, so this program ignores it.
+      elif tag_name in ('BODY', 'EXTENDED BODY', 'COMMENT', 'KEYWORDS'):
+        #sys.stderr.write('Normal: ' + str(linenum) + ': assumed the line is part of "' + tag_name+ '" content: "' + line + '"\n')
+        if tag_name in ('BODY', 'EXTENDED BODY', 'COMMENT'):
+          tag_contents += line + convert_breaks
+        else:
+          if tag_contents != "":
+            tag_contents += ","
+          tag_contents += line
+
+      # EXCERPT and PING (=Trackback) are ignored because they are not supported by Blogger. 
+      elif tag_name in ('EXCERPT', 'PING') and len(line)!=0:
+        sys.stderr.write('Warn: ' + str(linenum) + ': "' +tag_name+ '" is ignored by blogger: "' + line + '"\n')
 
       # This would be a line of content beyond a key/value pair
-      elif len(key) != 0:
-        tag_contents += line + '\n'
+      # Add back the '-----' or '--------' found one line above this line, because it was
+      # treated as a tag separator and dropped unexpectedly.
+      elif len(key) != 0: #and not ( tag_name in ('EXCERPT', 'PING') ):
+        if (tag_separator != ''): 
+          if ( not tag_name ): 
+            tag_name = tag_name_back
+            sys.stderr.write('Warn: ' + str(linenum) + ': need recover tag_name to "' + tag_name +'": "' + line + '"\n')
+          tag_contents += tag_separator + convert_breaks
+          sys.stderr.write('Warn: ' + str(linenum) + ': "' + tag_separator + '" was added back to previous line: "' + line + '"\n')
+          tag_separator = ''
+        sys.stderr.write('Warn: ' + str(linenum) + ': assumed the line is part of "' + tag_name+ '" content: "' + line + '"\n')
+        tag_contents += line + convert_breaks
+
+      # Lines unable to process
+      elif line != '': 
+      #else: 
+        sys.stderr.write('Error: ' + str(linenum) + ': cannot process line: "' + line + '"\n')
 
 
     # Update the feed with the last updated time
@@ -307,7 +376,8 @@
 
   def _FromMtTime(self, mt_time):
     try:
-      return time.strptime(mt_time, "%m/%d/%Y %I:%M:%S %p")
+      dt3 = datetime.datetime.strptime(mt_time, "%m/%d/%Y %I:%M:%S %p") +  datetime.timedelta(hours = -1 * timezone_gap)
+      return dt3.timetuple()
     except ValueError:
       return time.gmtime()
B忘log - Hi-LoのBlog

2020/06/25

MovableTypeからBloggerへ（その２）

0 件のコメント: