summaryrefslogtreecommitdiffstats
path: root/fbreader/src/formats/html/HtmlBookReader.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'fbreader/src/formats/html/HtmlBookReader.cpp')
-rw-r--r--fbreader/src/formats/html/HtmlBookReader.cpp583
1 files changed, 0 insertions, 583 deletions
diff --git a/fbreader/src/formats/html/HtmlBookReader.cpp b/fbreader/src/formats/html/HtmlBookReader.cpp
deleted file mode 100644
index 321913d..0000000
--- a/fbreader/src/formats/html/HtmlBookReader.cpp
+++ /dev/null
@@ -1,583 +0,0 @@
-/*
- * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-#include <cctype>
-
-#include <ZLFile.h>
-#include <ZLFileImage.h>
-#include <ZLStringUtil.h>
-
-#include "HtmlBookReader.h"
-#include "HtmlTagActions.h"
-#include "../txt/PlainTextFormat.h"
-#include "../util/MiscUtil.h"
-#include "../../bookmodel/BookModel.h"
-#include "../css/StyleSheetParser.h"
-
-HtmlTagAction::HtmlTagAction(HtmlBookReader &reader) : myReader(reader) {
-}
-
-HtmlTagAction::~HtmlTagAction() {
-}
-
-void HtmlTagAction::reset() {
-}
-
-DummyHtmlTagAction::DummyHtmlTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
-}
-
-void DummyHtmlTagAction::run(const HtmlReader::HtmlTag&) {
-}
-
-HtmlControlTagAction::HtmlControlTagAction(HtmlBookReader &reader, FBTextKind kind) : HtmlTagAction(reader), myKind(kind) {
-}
-
-void HtmlControlTagAction::run(const HtmlReader::HtmlTag &tag) {
- std::vector<FBTextKind> &list = myReader.myKindList;
- int index;
- for (index = list.size() - 1; index >= 0; --index) {
- if (list[index] == myKind) {
- break;
- }
- }
- if (tag.Start) {
- if (index == -1) {
- bookReader().pushKind(myKind);
- myReader.myKindList.push_back(myKind);
- bookReader().addControl(myKind, true);
- }
- } else {
- if (index >= 0) {
- for (int i = list.size() - 1; i >= index; --i) {
- bookReader().addControl(list[i], false);
- bookReader().popKind();
- }
- for (unsigned int j = index + 1; j < list.size(); ++j) {
- bookReader().addControl(list[j], true);
- bookReader().pushKind(list[j]);
- }
- list.erase(list.begin() + index);
- }
- }
-}
-
-HtmlHeaderTagAction::HtmlHeaderTagAction(HtmlBookReader &reader, FBTextKind kind) : HtmlTagAction(reader), myKind(kind) {
-}
-
-void HtmlHeaderTagAction::run(const HtmlReader::HtmlTag &tag) {
- myReader.myIsStarted = false;
- if (tag.Start) {
- if (myReader.myBuildTableOfContent && !myReader.myIgnoreTitles) {
- if (!bookReader().contentsParagraphIsOpen()) {
- bookReader().insertEndOfSectionParagraph();
- bookReader().enterTitle();
- bookReader().beginContentsParagraph();
- }
- }
- bookReader().pushKind(myKind);
- } else {
- bookReader().popKind();
- if (myReader.myBuildTableOfContent && !myReader.myIgnoreTitles) {
- bookReader().endContentsParagraph();
- bookReader().exitTitle();
- }
- }
- bookReader().beginParagraph();
-}
-
-HtmlIgnoreTagAction::HtmlIgnoreTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
-}
-
-void HtmlIgnoreTagAction::run(const HtmlReader::HtmlTag &tag) {
- if (tag.Start) {
- if (myTagNames.find(tag.Name) == myTagNames.end()) {
- ++myReader.myIgnoreDataCounter;
- myTagNames.insert(tag.Name);
- }
- } else {
- if (myTagNames.find(tag.Name) != myTagNames.end()) {
- --myReader.myIgnoreDataCounter;
- myTagNames.erase(tag.Name);
- }
- }
-}
-
-HtmlHrefTagAction::HtmlHrefTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
-}
-
-void HtmlHrefTagAction::run(const HtmlReader::HtmlTag &tag) {
- if (tag.Start) {
- for (unsigned int i = 0; i < tag.Attributes.size(); ++i) {
- if (tag.Attributes[i].Name == "NAME") {
- bookReader().addHyperlinkLabel(tag.Attributes[i].Value);
- } else if ((hyperlinkType() == REGULAR) && (tag.Attributes[i].Name == "HREF")) {
- std::string value = tag.Attributes[i].Value;
- if (!myReader.myFileName.empty() &&
- (value.length() > myReader.myFileName.length()) &&
- (value.substr(0, myReader.myFileName.length()) == myReader.myFileName)) {
- value = value.substr(myReader.myFileName.length());
- }
- if (!value.empty()) {
- if (value[0] == '#') {
- setHyperlinkType(INTERNAL_HYPERLINK);
- bookReader().addHyperlinkControl(INTERNAL_HYPERLINK, value.substr(1));
- } else {
- FBTextKind hyperlinkType = MiscUtil::referenceType(value);
- if (hyperlinkType != INTERNAL_HYPERLINK) {
- setHyperlinkType(hyperlinkType);
- bookReader().addHyperlinkControl(hyperlinkType, value);
- }
- }
- }
- }
- }
- } else if (hyperlinkType() != REGULAR) {
- bookReader().addControl(hyperlinkType(), false);
- setHyperlinkType(REGULAR);
- }
-}
-
-void HtmlHrefTagAction::reset() {
- setHyperlinkType(REGULAR);
-}
-
-FBTextKind HtmlHrefTagAction::hyperlinkType() const {
- return myHyperlinkType;
-}
-
-void HtmlHrefTagAction::setHyperlinkType(FBTextKind hyperlinkType) {
- myHyperlinkType = hyperlinkType;
-}
-
-HtmlImageTagAction::HtmlImageTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
-}
-
-void HtmlImageTagAction::run(const HtmlReader::HtmlTag &tag) {
- if (tag.Start) {
- bookReader().endParagraph();
- for (unsigned int i = 0; i < tag.Attributes.size(); ++i) {
- if (tag.Attributes[i].Name == "SRC") {
- const std::string fileName = MiscUtil::decodeHtmlURL(tag.Attributes[i].Value);
- const ZLFile file(myReader.myBaseDirPath + fileName);
- if (file.exists()) {
- bookReader().addImageReference(fileName);
- bookReader().addImage(fileName, new ZLFileImage(file, 0));
- }
- break;
- }
- }
- bookReader().beginParagraph();
- }
-}
-
-HtmlBreakTagAction::HtmlBreakTagAction(HtmlBookReader &reader, BreakType breakType) : HtmlTagAction(reader), myBreakType(breakType) {
-}
-
-void HtmlBreakTagAction::run(const HtmlReader::HtmlTag &tag) {
- if (myReader.myDontBreakParagraph) {
- myReader.myDontBreakParagraph = false;
- return;
- }
-
- if ((tag.Start && (myBreakType & BREAK_AT_START)) ||
- (!tag.Start && (myBreakType & BREAK_AT_END))) {
- bookReader().endParagraph();
- if (bookReader().isKindStackEmpty()) {
- bookReader().pushKind(REGULAR);
- }
- bookReader().beginParagraph();
- }
-}
-
-HtmlPreTagAction::HtmlPreTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
-}
-
-void HtmlPreTagAction::run(const HtmlReader::HtmlTag &tag) {
- bookReader().endParagraph();
- myReader.myIsPreformatted = tag.Start;
- myReader.mySpaceCounter = -1;
- myReader.myBreakCounter = 0;
- if (myReader.myFormat.breakType() == PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE) {
- if (tag.Start) {
- bookReader().pushKind(PREFORMATTED);
- } else {
- bookReader().popKind();
- }
- }
- bookReader().beginParagraph();
-}
-
-HtmlListTagAction::HtmlListTagAction(HtmlBookReader &reader, int startIndex) : HtmlTagAction(reader), myStartIndex(startIndex) {
-}
-
-void HtmlListTagAction::run(const HtmlReader::HtmlTag &tag) {
- if (tag.Start) {
- myReader.myListNumStack.push(myStartIndex);
- } else if (!myReader.myListNumStack.empty()) {
- myReader.myListNumStack.pop();
- }
-}
-
-HtmlListItemTagAction::HtmlListItemTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
-}
-
-void HtmlListItemTagAction::run(const HtmlReader::HtmlTag &tag) {
- if (tag.Start) {
- bookReader().endParagraph();
- bookReader().beginParagraph();
- if (!myReader.myListNumStack.empty()) {
- bookReader().addFixedHSpace(3 * myReader.myListNumStack.size());
- int &index = myReader.myListNumStack.top();
- if (index == 0) {
- myReader.addConvertedDataToBuffer("\342\200\242 ", 4, false);
- } else {
- std::string number;
- ZLStringUtil::appendNumber(number, index++);
- number += ". ";
- myReader.addConvertedDataToBuffer(number.data(), number.length(), false);
- }
- myReader.myDontBreakParagraph = true;
- }
- } else {
- myReader.myDontBreakParagraph = false;
- }
-}
-
-HtmlTableTagAction::HtmlTableTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
-}
-
-void HtmlTableTagAction::run(const HtmlReader::HtmlTag &tag) {
- if (tag.Start) {
- myReader.myIgnoreTitles = true;
- } else {
- myReader.myIgnoreTitles = false;
- }
-}
-
-HtmlStyleTagAction::HtmlStyleTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
-}
-
-void HtmlStyleTagAction::run(const HtmlReader::HtmlTag &tag) {
- myReader.myStyleSheetParser = tag.Start ? new StyleSheetTableParser(myReader.myStyleSheetTable) : 0;
- /*
- if (!tag.Start) {
- myReader.myStyleSheetTable.dump();
- }
- */
-}
-
-shared_ptr<HtmlTagAction> HtmlBookReader::createAction(const std::string &tag) {
- if (tag == "EM") {
- return new HtmlControlTagAction(*this, EMPHASIS);
- } else if (tag == "STRONG") {
- return new HtmlControlTagAction(*this, STRONG);
- } else if (tag == "B") {
- return new HtmlControlTagAction(*this, BOLD);
- } else if (tag == "I") {
- return new HtmlControlTagAction(*this, ITALIC);
- } else if (tag == "TT") {
- return new HtmlControlTagAction(*this, CODE);
- } else if (tag == "CODE") {
- return new HtmlControlTagAction(*this, CODE);
- } else if (tag == "CITE") {
- return new HtmlControlTagAction(*this, CITE);
- } else if (tag == "SUB") {
- return new HtmlControlTagAction(*this, SUB);
- } else if (tag == "SUP") {
- return new HtmlControlTagAction(*this, SUP);
- } else if (tag == "H1") {
- return new HtmlHeaderTagAction(*this, H1);
- } else if (tag == "H2") {
- return new HtmlHeaderTagAction(*this, H2);
- } else if (tag == "H3") {
- return new HtmlHeaderTagAction(*this, H3);
- } else if (tag == "H4") {
- return new HtmlHeaderTagAction(*this, H4);
- } else if (tag == "H5") {
- return new HtmlHeaderTagAction(*this, H5);
- } else if (tag == "H6") {
- return new HtmlHeaderTagAction(*this, H6);
- } else if (tag == "HEAD") {
- return new HtmlIgnoreTagAction(*this);
- } else if (tag == "TITLE") {
- return new HtmlIgnoreTagAction(*this);
- } else if (tag == "STYLE") {
- return new HtmlStyleTagAction(*this);
- } else if (tag == "SELECT") {
- return new HtmlIgnoreTagAction(*this);
- } else if (tag == "SCRIPT") {
- return new HtmlIgnoreTagAction(*this);
- } else if (tag == "A") {
- return new HtmlHrefTagAction(*this);
- } else if (tag == "TD") {
- //return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END);
- } else if (tag == "TR") {
- return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END);
- } else if (tag == "DIV") {
- return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END);
- } else if (tag == "DT") {
- return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START);
- } else if (tag == "P") {
- return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END);
- } else if (tag == "BR") {
- return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END);
- } else if (tag == "IMG") {
- return new HtmlImageTagAction(*this);
- } else if (tag == "UL") {
- return new HtmlListTagAction(*this, 0);
- } else if (tag == "MENU") {
- return new HtmlListTagAction(*this, 0);
- } else if (tag == "DIR") {
- return new HtmlListTagAction(*this, 0);
- } else if (tag == "OL") {
- return new HtmlListTagAction(*this, 1);
- } else if (tag == "LI") {
- return new HtmlListItemTagAction(*this);
- } else if (tag == "PRE") {
- if (myProcessPreTag) {
- return new HtmlPreTagAction(*this);
- }
- } else if (tag == "TABLE") {
- return new HtmlTableTagAction(*this);
- }
- /*
- } else if (tag == "DD") {
- return 0;
- } else if (tag == "DL") {
- return 0;
- } else if (tag == "DFN") {
- return 0;
- } else if (tag == "SAMP") {
- return 0;
- } else if (tag == "KBD") {
- return 0;
- } else if (tag == "VAR") {
- return 0;
- } else if (tag == "ABBR") {
- return 0;
- } else if (tag == "ACRONYM") {
- return 0;
- } else if (tag == "BLOCKQUOTE") {
- return 0;
- } else if (tag == "Q") {
- return 0;
- } else if (tag == "INS") {
- return 0;
- } else if (tag == "DEL") {
- return 0;
- } else if (tag == "BODY") {
- return 0;
- */
- return new DummyHtmlTagAction(*this);
-}
-
-void HtmlBookReader::setBuildTableOfContent(bool build) {
- myBuildTableOfContent = build;
-}
-
-void HtmlBookReader::setProcessPreTag(bool process) {
- myProcessPreTag = process;
-}
-
-HtmlBookReader::HtmlBookReader(const std::string &baseDirectoryPath, BookModel &model, const PlainTextFormat &format, const std::string &encoding) : HtmlReader(encoding), myBookReader(model), myBaseDirPath(baseDirectoryPath), myFormat(format), myBuildTableOfContent(true), myProcessPreTag(true) {
-}
-
-HtmlBookReader::~HtmlBookReader() {
-}
-
-void HtmlBookReader::addConvertedDataToBuffer(const char *text, std::size_t len, bool convert) {
- if (len > 0) {
- if (myDontBreakParagraph) {
- while (len > 0 && std::isspace(*text)) {
- --len;
- ++text;
- }
- if (len == 0) {
- return;
- }
- }
- if (convert) {
- myConverter->convert(myConverterBuffer, text, text + len);
- myBookReader.addData(myConverterBuffer);
- myBookReader.addContentsData(myConverterBuffer);
- myConverterBuffer.erase();
- } else {
- std::string strText(text, len);
- myBookReader.addData(strText);
- myBookReader.addContentsData(strText);
- }
- myDontBreakParagraph = false;
- }
-}
-
-bool HtmlBookReader::tagHandler(const HtmlTag &tag) {
- myConverter->reset();
-
- for (unsigned int i = 0; i < tag.Attributes.size(); ++i) {
- if (tag.Attributes[i].Name == "ID") {
- myBookReader.addHyperlinkLabel(tag.Attributes[i].Value);
- break;
- }
- }
- shared_ptr<HtmlTagAction> action = myActionMap[tag.Name];
- if (action.isNull()) {
- action = createAction(tag.Name);
- myActionMap[tag.Name] = action;
- }
- action->run(tag);
-
- return true;
-}
-
-void HtmlBookReader::preformattedCharacterDataHandler(const char *text, std::size_t len, bool convert) {
- const char *start = text;
- const char *end = text + len;
-
- int breakType = myFormat.breakType();
- if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE) {
- for (const char *ptr = text; ptr != end; ++ptr) {
- if (*ptr == '\n') {
- mySpaceCounter = 0;
- if (start < ptr) {
- addConvertedDataToBuffer(start, ptr - start, convert);
- } else {
- static const std::string SPACE = " ";
- myBookReader.addData(SPACE);
- }
- myBookReader.endParagraph();
- myBookReader.beginParagraph();
- start = ptr + 1;
- } else if (mySpaceCounter >= 0) {
- if (std::isspace((unsigned char)*ptr)) {
- ++mySpaceCounter;
- } else {
- myBookReader.addFixedHSpace(mySpaceCounter);
- mySpaceCounter = -1;
- }
- }
- }
- addConvertedDataToBuffer(start, end - start, convert);
- } else if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_LINE_WITH_INDENT) {
- for (const char *ptr = text; ptr != end; ++ptr) {
- if (std::isspace((unsigned char)*ptr)) {
- if (*ptr == '\n') {
- mySpaceCounter = 0;
- } else if (mySpaceCounter >= 0) {
- ++mySpaceCounter;
- }
- } else {
- if (mySpaceCounter > myFormat.ignoredIndent()) {
- if (ptr - start > mySpaceCounter) {
- addConvertedDataToBuffer(start, ptr - start - mySpaceCounter, convert);
- myBookReader.endParagraph();
- myBookReader.beginParagraph();
- }
- start = ptr;
- }
- mySpaceCounter = -1;
- }
- }
- mySpaceCounter = std::max(mySpaceCounter, 0);
- if (end - start > mySpaceCounter) {
- addConvertedDataToBuffer(start, end - start - mySpaceCounter, convert);
- }
- } else if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_EMPTY_LINE) {
- for (const char *ptr = start; ptr != end; ++ptr) {
- if (std::isspace((unsigned char)*ptr)) {
- if (*ptr == '\n') {
- ++myBreakCounter;
- }
- } else {
- if (myBreakCounter > 1) {
- addConvertedDataToBuffer(start, ptr - start, convert);
- myBookReader.endParagraph();
- myBookReader.beginParagraph();
- start = ptr;
- }
- myBreakCounter = 0;
- }
- }
- addConvertedDataToBuffer(start, end - start, convert);
- }
-}
-
-bool HtmlBookReader::characterDataHandler(const char *text, std::size_t len, bool convert) {
- if (!myStyleSheetParser.isNull()) {
- myStyleSheetParser->parse(text, len);
- return true;
- }
-
- if (myIgnoreDataCounter != 0) {
- return true;
- }
-
- if (myIsPreformatted) {
- preformattedCharacterDataHandler(text, len, convert);
- return true;
- }
-
- const char *ptr = text;
- const char *end = text + len;
- if (!myIsStarted) {
- for (; ptr != end; ++ptr) {
- if (!std::isspace((unsigned char)*ptr)) {
- myIsStarted = true;
- break;
- }
- }
- }
- if (myIsStarted) {
- addConvertedDataToBuffer(ptr, end - ptr, convert);
- }
- return true;
-}
-
-void HtmlBookReader::startDocumentHandler() {
- while (!myListNumStack.empty()) {
- myListNumStack.pop();
- }
- myConverterBuffer.erase();
- myKindList.clear();
-
- myBookReader.reset();
- myBookReader.setMainTextModel();
- myBookReader.pushKind(REGULAR);
- myBookReader.beginParagraph();
- myIgnoreDataCounter = 0;
- myIsPreformatted = false;
- myDontBreakParagraph = false;
- for (std::map<std::string,shared_ptr<HtmlTagAction> >::const_iterator it = myActionMap.begin(); it != myActionMap.end(); ++it) {
- it->second->reset();
- }
- myIsStarted = false;
- myIgnoreTitles = false;
-
- myStyleSheetParser = 0;
-
- mySpaceCounter = -1;
- myBreakCounter = 0;
-}
-
-void HtmlBookReader::endDocumentHandler() {
- myBookReader.endParagraph();
-}
-
-void HtmlBookReader::setFileName(const std::string fileName) {
- myFileName = fileName;
-}