diff --git a/ImportConsole/ImportConsole.csproj b/ImportConsole/ImportConsole.csproj index 5adaa62..e912e44 100644 --- a/ImportConsole/ImportConsole.csproj +++ b/ImportConsole/ImportConsole.csproj @@ -52,6 +52,7 @@ + diff --git a/ImportConsole/Program.cs b/ImportConsole/Program.cs index 5c1aa1e..537b94c 100644 --- a/ImportConsole/Program.cs +++ b/ImportConsole/Program.cs @@ -9,19 +9,22 @@ using FLocal.Common; namespace FLocal.ImportConsole { class Program { public static void Main(string[] args) { + Consolery.Run(typeof(Program), args); + } + private static void initializeConfig() { if(!Config.isInitialized) { lock(typeof(Config)) { if(!Config.isInitialized) { Config.Init(ConfigurationManager.AppSettings); } } - Consolery.Run(typeof(Program), args); } } [Action] public static void ImportUsers() { + initializeConfig(); try { UsersImporter.ImportUsers(); } catch(Exception e) { @@ -32,7 +35,13 @@ namespace FLocal.ImportConsole { [Action] public static void ProcessUpload(string pathToUpload) { + initializeConfig(); UploadProcessor.ProcessUpload(pathToUpload); } + + [Action] + public static void ConvertThreaded(string pathToThreaded, string outFile) { + ThreadedHTMLProcessor.Process(pathToThreaded, outFile); + } } } diff --git a/ImportConsole/ThreadedHTMLProcessor.cs b/ImportConsole/ThreadedHTMLProcessor.cs new file mode 100644 index 0000000..652af45 --- /dev/null +++ b/ImportConsole/ThreadedHTMLProcessor.cs @@ -0,0 +1,330 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.IO; +using System.Text.RegularExpressions; + +namespace FLocal.ImportConsole { + class ThreadedHTMLProcessor { + + private readonly static DateTime UNIX = new DateTime(1970, 1, 1, 0, 0, 0).ToLocalTime(); + + private readonly static Regex PARENT_BEGINMARKER = new Regex("\\["; + private const string POST_ENDMARKER_FULL_SIGNATURE = "
"; + private readonly static Regex POST_ENDMARKER_FULL = new Regex("\\s*\\s*\\s*\\s*\\s*\\s*\\s*
\\s*", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline); + private const string POST_ENDMARKER_FULL_DISCUSSION = "

\\s*\\s*\\s*
.*", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline); + private const string TITLE_ENDMARKER_FULL = ""; + private const string THREAD_BEGINMARKER_FULL = "type=favorite&Number="; + private const string THREAD_ENDMARKER_FULL = "&"; + + private const string POST_BEGINMARKER_LITE = "
"; + private const string POST_ENDMARKER_LITE = ""; + private const string TITLE_BEGINMARKER_LITE = ""; + private const string TITLE_ENDMARKER_LITE = ""; + private const string THREAD_BEGINMARKER_LITE = "view=&sb=&o=&thread="; + private const string THREAD_ENDMARKER_LITE = "&"; + + + public static void Process(string pathToThreadeds, string pathToOutput) { + + using(StreamWriter writer = new StreamWriter(pathToOutput, false, Encoding.ASCII)) { + DirectoryInfo directoryInfo = new DirectoryInfo(pathToThreadeds); + int i=0; + foreach(FileSystemInfo _info in directoryInfo.GetFiles()) { + if(i%100 == 0) { + Console.Write("[" + (int)(i/100) + "]"); + } + if(!(_info is FileInfo)) continue; + FileInfo info = (FileInfo)_info; + string[] parts = info.Name.Split('.'); + if((parts.Length != 2) || (parts[1].ToLower() != "txt")) continue; + int postId = int.Parse(parts[0]); + try { + string contentPost; + string contentTitle; + DateTime contentDate; + int? contentParent = null; + int contentThread; + string contentPoster; + int contentLayerId = 1; + string contentBoard; + using(StreamReader reader = new StreamReader(info.FullName, Importer.ShallerGateway.encoding)) { + string raw = reader.ReadToEnd(); + if(raw.Contains("-CATJUMP-1")) { + //full mode + string beforeBegin; + { + int beginPos = raw.IndexOf(POST_BEGINMARKER_FULL); + if(beginPos <= 0) throw new ApplicationException("beginPos <= 0"); + beforeBegin = raw.Substring(0, beginPos); + string afterBegin = raw.Substring(beginPos + POST_BEGINMARKER_FULL.Length); + int endPos; + endPos = afterBegin.IndexOf(POST_ENDMARKER_FULL_DISCUSSION); + if(endPos <= 0) { + endPos = afterBegin.IndexOf(POST_ENDMARKER_FULL_SIGNATURE); + if(endPos <= 0) { + Match endBodyMatch = POST_ENDMARKER_FULL.Match(afterBegin); + if(!endBodyMatch.Success) { + Console.WriteLine("afterBegin:"); + Console.WriteLine("==========================="); + Console.WriteLine(afterBegin); + Console.WriteLine("==========================="); + Console.WriteLine(POST_ENDMARKER_FULL.ToString()); + throw new ApplicationException("cannot match body end"); + } + endPos = endBodyMatch.Index; + } + } + contentPost = afterBegin.Substring(0, endPos); + } + + { + Match titleMatch = TITLE_BEGINMARKER_FULL.Match(beforeBegin); + if(!titleMatch.Success) throw new ApplicationException("cannot match title begin"); + string afterTitleBegin = beforeBegin.Substring(titleMatch.Index + titleMatch.Length); + int titleEndPos = afterTitleBegin.IndexOf(TITLE_ENDMARKER_FULL); + if(titleEndPos <= 0) { + throw new ApplicationException("titleEndPos <= 0"); + } + contentTitle = afterTitleBegin.Substring(0, titleEndPos); + } + + { + Match dateMatch = DATE_MATCH.Match(beforeBegin); + if(!dateMatch.Success) { + throw new ApplicationException("cannot match date"); + } + contentDate = new DateTime(int.Parse(dateMatch.Groups[3].Value), int.Parse(dateMatch.Groups[2].Value), int.Parse(dateMatch.Groups[1].Value), int.Parse(dateMatch.Groups[4].Value), int.Parse(dateMatch.Groups[5].Value), 0); + } + + { + Match parentMatch = PARENT_BEGINMARKER.Match(beforeBegin); + if(parentMatch.Success) { + string afterParentBegin = beforeBegin.Substring(parentMatch.Index + parentMatch.Length); + int parentEndPos = afterParentBegin.IndexOf(PARENT_ENDMARKER); + if(parentEndPos <= 0) { + throw new ApplicationException("parentEndPos <= 0"); + } + contentParent = int.Parse(afterParentBegin.Substring(0, parentEndPos)); + } + } + + { + int posterBeginPos = beforeBegin.IndexOf(POSTER_BEGINMARKER); + if(posterBeginPos > 0) { + string afterPosterBegin = beforeBegin.Substring(posterBeginPos + POSTER_BEGINMARKER.Length); + int posterEndPos = afterPosterBegin.IndexOf(POSTER_ENDMARKER); + if(posterEndPos <= 0) { + throw new ApplicationException("posterEndPos <= 0"); + } + contentPoster = afterPosterBegin.Substring(0, posterEndPos); + } else { + posterBeginPos = beforeBegin.IndexOf(POSTER_BEGINMARKER_GUEST); + if(posterBeginPos <= 0) { + throw new ApplicationException("posterBeginPos <= 0"); + } + string afterPosterBegin = beforeBegin.Substring(posterBeginPos + POSTER_BEGINMARKER_GUEST.Length); + int posterEndPos = afterPosterBegin.IndexOf(POSTER_ENDMARKER); + if(posterEndPos <= 0) { + throw new ApplicationException("posterEndPos <= 0"); + } + contentPoster = "Guest " + afterPosterBegin.Substring(0, posterEndPos); + } + } + + { + int threadBeginPos = raw.IndexOf(THREAD_BEGINMARKER_FULL); + if(threadBeginPos <= 0) { + throw new ApplicationException("threadbeginpos <= 0"); + } + string afterThreadBegin = raw.Substring(threadBeginPos + THREAD_BEGINMARKER_FULL.Length); + int threadEndPos = afterThreadBegin.IndexOf(THREAD_ENDMARKER_FULL); + if(threadEndPos <= 0) { + throw new ApplicationException("threadEndPos <= 0"); + } + contentThread = int.Parse(afterThreadBegin.Substring(0, threadEndPos)); + } + + { + int boardBeginPos = beforeBegin.IndexOf(BOARD_BEGINMARKER); + if(boardBeginPos <= 0) { + throw new ApplicationException("boardbeginpos <= 0"); + } + string afterBoardBegin = beforeBegin.Substring(boardBeginPos + BOARD_BEGINMARKER.Length); + int boardEndPos = afterBoardBegin.IndexOf(BOARD_ENDMARKER); + if(boardEndPos <= 0) { + throw new ApplicationException("boardEndPos <= 0"); + } + contentBoard = afterBoardBegin.Substring(0, boardEndPos); + } + + if(beforeBegin.IndexOf("trash.gif") > 0) { + contentLayerId = 3; + } else if(beforeBegin.IndexOf("eye.gif") > 0) { + contentLayerId = 2; + } + + } else { + //lite mode + string beforeBegin; + { + int beginPos = raw.IndexOf(POST_BEGINMARKER_LITE); + if(beginPos <= 0) throw new ApplicationException("beginPos <= 0"); + beforeBegin = raw.Substring(0, beginPos); + string afterBegin = raw.Substring(beginPos + POST_BEGINMARKER_LITE.Length); + int endPos; + endPos = afterBegin.IndexOf(POST_ENDMARKER_LITE); + if(endPos <= 0) { + throw new ApplicationException("cannot match body end"); + } + contentPost = afterBegin.Substring(0, endPos); + } + + { + int titleBeginPos = beforeBegin.IndexOf(TITLE_BEGINMARKER_LITE); + if(titleBeginPos <= 0) { + throw new ApplicationException("titlebeginpos <= 0"); + } + string afterTitleBegin = beforeBegin.Substring(titleBeginPos + TITLE_BEGINMARKER_LITE.Length); + int titleEndPos = afterTitleBegin.IndexOf(TITLE_ENDMARKER_LITE); + if(titleEndPos <= 0) { + throw new ApplicationException("titleEndPos <= 0"); + } + contentTitle = afterTitleBegin.Substring(0, titleEndPos); + } + + { + Match dateMatch = DATE_MATCH.Match(beforeBegin); + if(!dateMatch.Success) { + throw new ApplicationException("cannot match date"); + } + contentDate = new DateTime(int.Parse(dateMatch.Groups[3].Value), int.Parse(dateMatch.Groups[2].Value), int.Parse(dateMatch.Groups[1].Value), int.Parse(dateMatch.Groups[4].Value), int.Parse(dateMatch.Groups[5].Value), 0); + } + + { + Match parentMatch = PARENT_BEGINMARKER.Match(beforeBegin); + if(parentMatch.Success) { + string afterParentBegin = beforeBegin.Substring(parentMatch.Index + parentMatch.Length); + int parentEndPos = afterParentBegin.IndexOf(PARENT_ENDMARKER); + if(parentEndPos <= 0) { + throw new ApplicationException("parentEndPos <= 0"); + } + contentParent = int.Parse(afterParentBegin.Substring(0, parentEndPos)); + } + } + + { + int posterBeginPos = beforeBegin.IndexOf(POSTER_BEGINMARKER); + if(posterBeginPos > 0) { + string afterPosterBegin = beforeBegin.Substring(posterBeginPos + POSTER_BEGINMARKER.Length); + int posterEndPos = afterPosterBegin.IndexOf(POSTER_ENDMARKER); + if(posterEndPos <= 0) { + throw new ApplicationException("posterEndPos <= 0"); + } + contentPoster = afterPosterBegin.Substring(0, posterEndPos); + } else { + posterBeginPos = beforeBegin.IndexOf(POSTER_BEGINMARKER_GUEST); + if(posterBeginPos <= 0) { + //if(!beforeBegin.Contains("Anonymous")) { + //throw new ApplicationException("posterBeginPos <= 0"); + //} else { + contentPoster = "Anonymous"; + //} + } else { + string afterPosterBegin = beforeBegin.Substring(posterBeginPos + POSTER_BEGINMARKER_GUEST.Length); + int posterEndPos = afterPosterBegin.IndexOf(POSTER_ENDMARKER); + if(posterEndPos <= 0) { + throw new ApplicationException("posterEndPos <= 0"); + } + contentPoster = "Guest " + afterPosterBegin.Substring(0, posterEndPos); + } + } + } + + { + int threadBeginPos = beforeBegin.IndexOf(THREAD_BEGINMARKER_LITE); + if(threadBeginPos <= 0) { + throw new ApplicationException("threadbeginpos <= 0"); + } + string afterThreadBegin = beforeBegin.Substring(threadBeginPos + THREAD_BEGINMARKER_LITE.Length); + int threadEndPos = afterThreadBegin.IndexOf(THREAD_ENDMARKER_LITE); + if(threadEndPos <= 0) { + throw new ApplicationException("threadEndPos <= 0"); + } + contentThread = int.Parse(afterThreadBegin.Substring(0, threadEndPos)); + } + + { + int boardBeginPos = beforeBegin.IndexOf(BOARD_BEGINMARKER); + if(boardBeginPos <= 0) { + throw new ApplicationException("boardbeginpos <= 0"); + } + string afterBoardBegin = beforeBegin.Substring(boardBeginPos + BOARD_BEGINMARKER.Length); + int boardEndPos = afterBoardBegin.IndexOf(BOARD_ENDMARKER); + if(boardEndPos <= 0) { + throw new ApplicationException("boardEndPos <= 0"); + } + contentBoard = afterBoardBegin.Substring(0, boardEndPos); + } + + if(beforeBegin.IndexOf("(xx)") > 0) { + contentLayerId = 3; + } else if(beforeBegin.IndexOf("(x)") > 0) { + contentLayerId = 2; + } + } + } + if(!contentParent.HasValue) contentParent = 0; + contentTitle = contentTitle.Trim(); + contentPost = contentPost.Trim(); + /*Console.WriteLine("============================="); + Console.WriteLine("PostId: " + postId.ToString()); + Console.WriteLine("Board: " + contentBoard); + Console.WriteLine("Layer: " + contentLayerId.ToString()); + Console.WriteLine("Date: " + contentDate.ToString()); + Console.WriteLine("Parent: " + contentParent.ToString()); + Console.WriteLine("Thread: " + contentThread.ToString()); + Console.WriteLine("Poster: " + contentPoster); + Console.WriteLine("Title: " + contentTitle); + Console.WriteLine("Body: " + contentPost); + Console.ReadLine();*/ + writer.WriteLine( + Importer.DictionaryConverter.ToDump( + new Dictionary { + { "Subject", contentTitle }, + { "Board", contentBoard }, + { "UnixTime", ((int)(contentDate.Subtract(UNIX).TotalSeconds)).ToString() }, + { "Parent", contentParent.ToString() }, + { "Main", contentThread.ToString() }, + { "Local_Main", contentThread.ToString() }, + { "Username", contentPoster }, + { "Body", contentPost }, + { "Layer", contentLayerId.ToString() }, + } + ) + ); + Console.Write("+"); + } catch(Exception e) { + Console.Error.WriteLine("Could not process post #" + postId + ": " + e.GetType().FullName + ": " + e.Message); + Console.Error.WriteLine(e.StackTrace); + } finally { + i++; + } + } + } + + } + + } +} diff --git a/Importer/DictionaryConverter.cs b/Importer/DictionaryConverter.cs new file mode 100644 index 0000000..22fb47e --- /dev/null +++ b/Importer/DictionaryConverter.cs @@ -0,0 +1,19 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Web; + +namespace FLocal.Importer { + public static class DictionaryConverter { + + public static string ToDump(Dictionary dict) { + return string.Join(" ", (from kvp in dict select HttpUtility.UrlEncode(kvp.Key, ShallerConnector.encoding) + "=" + HttpUtility.UrlEncode(kvp.Value, ShallerConnector.encoding)).ToArray()); + } + + public static Dictionary FromDump(string dump) { + return (from elem in dump.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries) let parts = elem.Split(new char[] { '=' }, 2) select new KeyValuePair(HttpUtility.UrlDecode(parts[0], ShallerConnector.encoding), HttpUtility.UrlDecode(parts[1], ShallerConnector.encoding))).ToDictionary(kvp => kvp.Key, kvp => kvp.Value); + } + + } +} diff --git a/Importer/Importer.csproj b/Importer/Importer.csproj index b24a90c..4e32764 100644 --- a/Importer/Importer.csproj +++ b/Importer/Importer.csproj @@ -47,6 +47,7 @@ + diff --git a/Importer/ShallerGateway.cs b/Importer/ShallerGateway.cs index b1a6e96..93cec1d 100644 --- a/Importer/ShallerGateway.cs +++ b/Importer/ShallerGateway.cs @@ -10,6 +10,8 @@ using System.IO; namespace FLocal.Importer { public class ShallerGateway { + public static readonly Encoding encoding = ShallerConnector.encoding; + public static string getUserInfoAsString(string userName) { //if(userName != HttpUtility.UrlEncode(userName, ShallerConnector.encoding)) throw new ApplicationException("'" + userName + "':showprofile.php?User=" + HttpUtility.UrlEncode(userName, ShallerConnector.encoding) + "&What=login&showlite=l"); return ShallerConnector.getPageContent("showprofile.php?User=" + HttpUtility.UrlEncode(userName, ShallerConnector.encoding) + "&What=login&showlite=l", new Dictionary(), new System.Net.CookieContainer());