diff options
| author | Michele Calgaro <michele.calgaro@yahoo.it> | 2024-06-07 23:30:05 +0900 |
|---|---|---|
| committer | Michele Calgaro <michele.calgaro@yahoo.it> | 2024-06-07 23:30:05 +0900 |
| commit | 17b259df9cb6b28779d4881b2b6c805ee2e48eea (patch) | |
| tree | 5ed61937459cb7081089111b0242c01ec178f1f3 /reader/src/formats/html/HtmlBookReader.cpp | |
| parent | 1cba8bce178eb2d6719c6f7f21e2c9352c5513a6 (diff) | |
| download | tde-ebook-reader-17b259df9cb6b28779d4881b2b6c805ee2e48eea.tar.gz tde-ebook-reader-17b259df9cb6b28779d4881b2b6c805ee2e48eea.zip | |
Rename to tde-ebook-reader
Signed-off-by: Michele Calgaro <michele.calgaro@yahoo.it>
Diffstat (limited to 'reader/src/formats/html/HtmlBookReader.cpp')
| -rw-r--r-- | reader/src/formats/html/HtmlBookReader.cpp | 583 |
1 files changed, 583 insertions, 0 deletions
diff --git a/reader/src/formats/html/HtmlBookReader.cpp b/reader/src/formats/html/HtmlBookReader.cpp new file mode 100644 index 0000000..321913d --- /dev/null +++ b/reader/src/formats/html/HtmlBookReader.cpp @@ -0,0 +1,583 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <cctype> + +#include <ZLFile.h> +#include <ZLFileImage.h> +#include <ZLStringUtil.h> + +#include "HtmlBookReader.h" +#include "HtmlTagActions.h" +#include "../txt/PlainTextFormat.h" +#include "../util/MiscUtil.h" +#include "../../bookmodel/BookModel.h" +#include "../css/StyleSheetParser.h" + +HtmlTagAction::HtmlTagAction(HtmlBookReader &reader) : myReader(reader) { +} + +HtmlTagAction::~HtmlTagAction() { +} + +void HtmlTagAction::reset() { +} + +DummyHtmlTagAction::DummyHtmlTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void DummyHtmlTagAction::run(const HtmlReader::HtmlTag&) { +} + +HtmlControlTagAction::HtmlControlTagAction(HtmlBookReader &reader, FBTextKind kind) : HtmlTagAction(reader), myKind(kind) { +} + +void HtmlControlTagAction::run(const HtmlReader::HtmlTag &tag) { + std::vector<FBTextKind> &list = myReader.myKindList; + int index; + for (index = list.size() - 1; index >= 0; --index) { + if (list[index] == myKind) { + break; + } + } + if (tag.Start) { + if (index == -1) { + bookReader().pushKind(myKind); + myReader.myKindList.push_back(myKind); + bookReader().addControl(myKind, true); + } + } else { + if (index >= 0) { + for (int i = list.size() - 1; i >= index; --i) { + bookReader().addControl(list[i], false); + bookReader().popKind(); + } + for (unsigned int j = index + 1; j < list.size(); ++j) { + bookReader().addControl(list[j], true); + bookReader().pushKind(list[j]); + } + list.erase(list.begin() + index); + } + } +} + +HtmlHeaderTagAction::HtmlHeaderTagAction(HtmlBookReader &reader, FBTextKind kind) : HtmlTagAction(reader), myKind(kind) { +} + +void HtmlHeaderTagAction::run(const HtmlReader::HtmlTag &tag) { + myReader.myIsStarted = false; + if (tag.Start) { + if (myReader.myBuildTableOfContent && !myReader.myIgnoreTitles) { + if (!bookReader().contentsParagraphIsOpen()) { + bookReader().insertEndOfSectionParagraph(); + bookReader().enterTitle(); + bookReader().beginContentsParagraph(); + } + } + bookReader().pushKind(myKind); + } else { + bookReader().popKind(); + if (myReader.myBuildTableOfContent && !myReader.myIgnoreTitles) { + bookReader().endContentsParagraph(); + bookReader().exitTitle(); + } + } + bookReader().beginParagraph(); +} + +HtmlIgnoreTagAction::HtmlIgnoreTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlIgnoreTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + if (myTagNames.find(tag.Name) == myTagNames.end()) { + ++myReader.myIgnoreDataCounter; + myTagNames.insert(tag.Name); + } + } else { + if (myTagNames.find(tag.Name) != myTagNames.end()) { + --myReader.myIgnoreDataCounter; + myTagNames.erase(tag.Name); + } + } +} + +HtmlHrefTagAction::HtmlHrefTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlHrefTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + for (unsigned int i = 0; i < tag.Attributes.size(); ++i) { + if (tag.Attributes[i].Name == "NAME") { + bookReader().addHyperlinkLabel(tag.Attributes[i].Value); + } else if ((hyperlinkType() == REGULAR) && (tag.Attributes[i].Name == "HREF")) { + std::string value = tag.Attributes[i].Value; + if (!myReader.myFileName.empty() && + (value.length() > myReader.myFileName.length()) && + (value.substr(0, myReader.myFileName.length()) == myReader.myFileName)) { + value = value.substr(myReader.myFileName.length()); + } + if (!value.empty()) { + if (value[0] == '#') { + setHyperlinkType(INTERNAL_HYPERLINK); + bookReader().addHyperlinkControl(INTERNAL_HYPERLINK, value.substr(1)); + } else { + FBTextKind hyperlinkType = MiscUtil::referenceType(value); + if (hyperlinkType != INTERNAL_HYPERLINK) { + setHyperlinkType(hyperlinkType); + bookReader().addHyperlinkControl(hyperlinkType, value); + } + } + } + } + } + } else if (hyperlinkType() != REGULAR) { + bookReader().addControl(hyperlinkType(), false); + setHyperlinkType(REGULAR); + } +} + +void HtmlHrefTagAction::reset() { + setHyperlinkType(REGULAR); +} + +FBTextKind HtmlHrefTagAction::hyperlinkType() const { + return myHyperlinkType; +} + +void HtmlHrefTagAction::setHyperlinkType(FBTextKind hyperlinkType) { + myHyperlinkType = hyperlinkType; +} + +HtmlImageTagAction::HtmlImageTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlImageTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + bookReader().endParagraph(); + for (unsigned int i = 0; i < tag.Attributes.size(); ++i) { + if (tag.Attributes[i].Name == "SRC") { + const std::string fileName = MiscUtil::decodeHtmlURL(tag.Attributes[i].Value); + const ZLFile file(myReader.myBaseDirPath + fileName); + if (file.exists()) { + bookReader().addImageReference(fileName); + bookReader().addImage(fileName, new ZLFileImage(file, 0)); + } + break; + } + } + bookReader().beginParagraph(); + } +} + +HtmlBreakTagAction::HtmlBreakTagAction(HtmlBookReader &reader, BreakType breakType) : HtmlTagAction(reader), myBreakType(breakType) { +} + +void HtmlBreakTagAction::run(const HtmlReader::HtmlTag &tag) { + if (myReader.myDontBreakParagraph) { + myReader.myDontBreakParagraph = false; + return; + } + + if ((tag.Start && (myBreakType & BREAK_AT_START)) || + (!tag.Start && (myBreakType & BREAK_AT_END))) { + bookReader().endParagraph(); + if (bookReader().isKindStackEmpty()) { + bookReader().pushKind(REGULAR); + } + bookReader().beginParagraph(); + } +} + +HtmlPreTagAction::HtmlPreTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlPreTagAction::run(const HtmlReader::HtmlTag &tag) { + bookReader().endParagraph(); + myReader.myIsPreformatted = tag.Start; + myReader.mySpaceCounter = -1; + myReader.myBreakCounter = 0; + if (myReader.myFormat.breakType() == PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE) { + if (tag.Start) { + bookReader().pushKind(PREFORMATTED); + } else { + bookReader().popKind(); + } + } + bookReader().beginParagraph(); +} + +HtmlListTagAction::HtmlListTagAction(HtmlBookReader &reader, int startIndex) : HtmlTagAction(reader), myStartIndex(startIndex) { +} + +void HtmlListTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + myReader.myListNumStack.push(myStartIndex); + } else if (!myReader.myListNumStack.empty()) { + myReader.myListNumStack.pop(); + } +} + +HtmlListItemTagAction::HtmlListItemTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlListItemTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + bookReader().endParagraph(); + bookReader().beginParagraph(); + if (!myReader.myListNumStack.empty()) { + bookReader().addFixedHSpace(3 * myReader.myListNumStack.size()); + int &index = myReader.myListNumStack.top(); + if (index == 0) { + myReader.addConvertedDataToBuffer("\342\200\242 ", 4, false); + } else { + std::string number; + ZLStringUtil::appendNumber(number, index++); + number += ". "; + myReader.addConvertedDataToBuffer(number.data(), number.length(), false); + } + myReader.myDontBreakParagraph = true; + } + } else { + myReader.myDontBreakParagraph = false; + } +} + +HtmlTableTagAction::HtmlTableTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlTableTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + myReader.myIgnoreTitles = true; + } else { + myReader.myIgnoreTitles = false; + } +} + +HtmlStyleTagAction::HtmlStyleTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlStyleTagAction::run(const HtmlReader::HtmlTag &tag) { + myReader.myStyleSheetParser = tag.Start ? new StyleSheetTableParser(myReader.myStyleSheetTable) : 0; + /* + if (!tag.Start) { + myReader.myStyleSheetTable.dump(); + } + */ +} + +shared_ptr<HtmlTagAction> HtmlBookReader::createAction(const std::string &tag) { + if (tag == "EM") { + return new HtmlControlTagAction(*this, EMPHASIS); + } else if (tag == "STRONG") { + return new HtmlControlTagAction(*this, STRONG); + } else if (tag == "B") { + return new HtmlControlTagAction(*this, BOLD); + } else if (tag == "I") { + return new HtmlControlTagAction(*this, ITALIC); + } else if (tag == "TT") { + return new HtmlControlTagAction(*this, CODE); + } else if (tag == "CODE") { + return new HtmlControlTagAction(*this, CODE); + } else if (tag == "CITE") { + return new HtmlControlTagAction(*this, CITE); + } else if (tag == "SUB") { + return new HtmlControlTagAction(*this, SUB); + } else if (tag == "SUP") { + return new HtmlControlTagAction(*this, SUP); + } else if (tag == "H1") { + return new HtmlHeaderTagAction(*this, H1); + } else if (tag == "H2") { + return new HtmlHeaderTagAction(*this, H2); + } else if (tag == "H3") { + return new HtmlHeaderTagAction(*this, H3); + } else if (tag == "H4") { + return new HtmlHeaderTagAction(*this, H4); + } else if (tag == "H5") { + return new HtmlHeaderTagAction(*this, H5); + } else if (tag == "H6") { + return new HtmlHeaderTagAction(*this, H6); + } else if (tag == "HEAD") { + return new HtmlIgnoreTagAction(*this); + } else if (tag == "TITLE") { + return new HtmlIgnoreTagAction(*this); + } else if (tag == "STYLE") { + return new HtmlStyleTagAction(*this); + } else if (tag == "SELECT") { + return new HtmlIgnoreTagAction(*this); + } else if (tag == "SCRIPT") { + return new HtmlIgnoreTagAction(*this); + } else if (tag == "A") { + return new HtmlHrefTagAction(*this); + } else if (tag == "TD") { + //return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END); + } else if (tag == "TR") { + return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END); + } else if (tag == "DIV") { + return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END); + } else if (tag == "DT") { + return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START); + } else if (tag == "P") { + return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END); + } else if (tag == "BR") { + return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END); + } else if (tag == "IMG") { + return new HtmlImageTagAction(*this); + } else if (tag == "UL") { + return new HtmlListTagAction(*this, 0); + } else if (tag == "MENU") { + return new HtmlListTagAction(*this, 0); + } else if (tag == "DIR") { + return new HtmlListTagAction(*this, 0); + } else if (tag == "OL") { + return new HtmlListTagAction(*this, 1); + } else if (tag == "LI") { + return new HtmlListItemTagAction(*this); + } else if (tag == "PRE") { + if (myProcessPreTag) { + return new HtmlPreTagAction(*this); + } + } else if (tag == "TABLE") { + return new HtmlTableTagAction(*this); + } + /* + } else if (tag == "DD") { + return 0; + } else if (tag == "DL") { + return 0; + } else if (tag == "DFN") { + return 0; + } else if (tag == "SAMP") { + return 0; + } else if (tag == "KBD") { + return 0; + } else if (tag == "VAR") { + return 0; + } else if (tag == "ABBR") { + return 0; + } else if (tag == "ACRONYM") { + return 0; + } else if (tag == "BLOCKQUOTE") { + return 0; + } else if (tag == "Q") { + return 0; + } else if (tag == "INS") { + return 0; + } else if (tag == "DEL") { + return 0; + } else if (tag == "BODY") { + return 0; + */ + return new DummyHtmlTagAction(*this); +} + +void HtmlBookReader::setBuildTableOfContent(bool build) { + myBuildTableOfContent = build; +} + +void HtmlBookReader::setProcessPreTag(bool process) { + myProcessPreTag = process; +} + +HtmlBookReader::HtmlBookReader(const std::string &baseDirectoryPath, BookModel &model, const PlainTextFormat &format, const std::string &encoding) : HtmlReader(encoding), myBookReader(model), myBaseDirPath(baseDirectoryPath), myFormat(format), myBuildTableOfContent(true), myProcessPreTag(true) { +} + +HtmlBookReader::~HtmlBookReader() { +} + +void HtmlBookReader::addConvertedDataToBuffer(const char *text, std::size_t len, bool convert) { + if (len > 0) { + if (myDontBreakParagraph) { + while (len > 0 && std::isspace(*text)) { + --len; + ++text; + } + if (len == 0) { + return; + } + } + if (convert) { + myConverter->convert(myConverterBuffer, text, text + len); + myBookReader.addData(myConverterBuffer); + myBookReader.addContentsData(myConverterBuffer); + myConverterBuffer.erase(); + } else { + std::string strText(text, len); + myBookReader.addData(strText); + myBookReader.addContentsData(strText); + } + myDontBreakParagraph = false; + } +} + +bool HtmlBookReader::tagHandler(const HtmlTag &tag) { + myConverter->reset(); + + for (unsigned int i = 0; i < tag.Attributes.size(); ++i) { + if (tag.Attributes[i].Name == "ID") { + myBookReader.addHyperlinkLabel(tag.Attributes[i].Value); + break; + } + } + shared_ptr<HtmlTagAction> action = myActionMap[tag.Name]; + if (action.isNull()) { + action = createAction(tag.Name); + myActionMap[tag.Name] = action; + } + action->run(tag); + + return true; +} + +void HtmlBookReader::preformattedCharacterDataHandler(const char *text, std::size_t len, bool convert) { + const char *start = text; + const char *end = text + len; + + int breakType = myFormat.breakType(); + if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE) { + for (const char *ptr = text; ptr != end; ++ptr) { + if (*ptr == '\n') { + mySpaceCounter = 0; + if (start < ptr) { + addConvertedDataToBuffer(start, ptr - start, convert); + } else { + static const std::string SPACE = " "; + myBookReader.addData(SPACE); + } + myBookReader.endParagraph(); + myBookReader.beginParagraph(); + start = ptr + 1; + } else if (mySpaceCounter >= 0) { + if (std::isspace((unsigned char)*ptr)) { + ++mySpaceCounter; + } else { + myBookReader.addFixedHSpace(mySpaceCounter); + mySpaceCounter = -1; + } + } + } + addConvertedDataToBuffer(start, end - start, convert); + } else if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_LINE_WITH_INDENT) { + for (const char *ptr = text; ptr != end; ++ptr) { + if (std::isspace((unsigned char)*ptr)) { + if (*ptr == '\n') { + mySpaceCounter = 0; + } else if (mySpaceCounter >= 0) { + ++mySpaceCounter; + } + } else { + if (mySpaceCounter > myFormat.ignoredIndent()) { + if (ptr - start > mySpaceCounter) { + addConvertedDataToBuffer(start, ptr - start - mySpaceCounter, convert); + myBookReader.endParagraph(); + myBookReader.beginParagraph(); + } + start = ptr; + } + mySpaceCounter = -1; + } + } + mySpaceCounter = std::max(mySpaceCounter, 0); + if (end - start > mySpaceCounter) { + addConvertedDataToBuffer(start, end - start - mySpaceCounter, convert); + } + } else if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_EMPTY_LINE) { + for (const char *ptr = start; ptr != end; ++ptr) { + if (std::isspace((unsigned char)*ptr)) { + if (*ptr == '\n') { + ++myBreakCounter; + } + } else { + if (myBreakCounter > 1) { + addConvertedDataToBuffer(start, ptr - start, convert); + myBookReader.endParagraph(); + myBookReader.beginParagraph(); + start = ptr; + } + myBreakCounter = 0; + } + } + addConvertedDataToBuffer(start, end - start, convert); + } +} + +bool HtmlBookReader::characterDataHandler(const char *text, std::size_t len, bool convert) { + if (!myStyleSheetParser.isNull()) { + myStyleSheetParser->parse(text, len); + return true; + } + + if (myIgnoreDataCounter != 0) { + return true; + } + + if (myIsPreformatted) { + preformattedCharacterDataHandler(text, len, convert); + return true; + } + + const char *ptr = text; + const char *end = text + len; + if (!myIsStarted) { + for (; ptr != end; ++ptr) { + if (!std::isspace((unsigned char)*ptr)) { + myIsStarted = true; + break; + } + } + } + if (myIsStarted) { + addConvertedDataToBuffer(ptr, end - ptr, convert); + } + return true; +} + +void HtmlBookReader::startDocumentHandler() { + while (!myListNumStack.empty()) { + myListNumStack.pop(); + } + myConverterBuffer.erase(); + myKindList.clear(); + + myBookReader.reset(); + myBookReader.setMainTextModel(); + myBookReader.pushKind(REGULAR); + myBookReader.beginParagraph(); + myIgnoreDataCounter = 0; + myIsPreformatted = false; + myDontBreakParagraph = false; + for (std::map<std::string,shared_ptr<HtmlTagAction> >::const_iterator it = myActionMap.begin(); it != myActionMap.end(); ++it) { + it->second->reset(); + } + myIsStarted = false; + myIgnoreTitles = false; + + myStyleSheetParser = 0; + + mySpaceCounter = -1; + myBreakCounter = 0; +} + +void HtmlBookReader::endDocumentHandler() { + myBookReader.endParagraph(); +} + +void HtmlBookReader::setFileName(const std::string fileName) { + myFileName = fileName; +} |
