/*************************************************************************** * Copyright (C) 2004-2009 by Thomas Fischer * * fischer@unix-ag.uni-kl.de * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * ***************************************************************************/ #include #include #include #include "file.h" #include "comment.h" #include "macro.h" #include "preamble.h" #include "entry.h" #include "element.h" #include "encoderlatex.h" #include "value.h" #include "fileimporterbibtex.h" #define max(a,b) ((a)<(b)?(b):(a)) namespace BibTeX { const TQString extraAlphaNumChars = TQString( "?'`-_:.+/$\\\"&" ); const TQRegExp htmlRegExp = TQRegExp( "]*>", false ); FileImporterBibTeX::FileImporterBibTeX( bool personFirstNameFirst, TQString encoding ) : FileImporter(), m_personFirstNameFirst( personFirstNameFirst ), m_currentChar( ' ' ), m_ignoreComments( FALSE ), m_lineBufferSize( 4096 ), m_encoding( encoding ) { cancelFlag = FALSE; m_lineBuffer = new char[m_lineBufferSize]; m_textStream = NULL; } FileImporterBibTeX::~FileImporterBibTeX() { delete[] m_lineBuffer; } File* FileImporterBibTeX::load( TQIODevice *iodevice ) { m_mutex.lock(); cancelFlag = FALSE; TQString rawText; const char *encodingFrom = m_encoding == "latex" ? "utf-8\0" : m_encoding.append( "\0" ).ascii(); iconv_t iconvHandle = iconv_open( "utf-8", encodingFrom ); char *convertedLine = new char[m_lineBufferSize * 4]; int len; bool encodingOk = true; while ( encodingOk && iodevice->isReadable() && ( len = iodevice->readLine( m_lineBuffer, m_lineBufferSize ) ) > 0 ) { evaluateParameterComments( iconvHandle, m_lineBuffer ); char *raw = m_lineBuffer; char *enc = convertedLine; size_t encLen = m_lineBufferSize, rawLen = ( size_t )len; size_t result = iconv( iconvHandle, &raw, &rawLen, &enc, &encLen ); tqApp->processEvents(); if ( result != 0 ) { TQString problematic = TQString( m_lineBuffer ).mid( max( 0, m_lineBufferSize - encLen - 15 ), 30 ); if ( problematic.isNull() || problematic.isEmpty() ) problematic = TQString( m_lineBuffer ); tqDebug( "iconv resulted in error code %i for source encoding %s, maybe file is in different encoding? Problem is somewhere here: \"%s\"", result, encodingFrom, problematic.latin1() ); encodingOk = false; break; } if ( rawLen > 0 ) { tqDebug( "iconv could not convert complete string, only %i out of %i chars", len - rawLen, len ); encodingOk = false; break; } enc[0] = '\0'; /** remove leading UTF-8 byte-order mark (BOM) */ int offset = 0; while (((( unsigned char )convertedLine[offset] ) == 0xef || (( unsigned char )convertedLine[offset] ) == 0xbb || (( unsigned char )convertedLine[offset] ) == 0xbf ) && offset < 4 ) ++offset; TQString line = TQString::fromUtf8( convertedLine + offset ); rawText.append( line ); } iconv_close( iconvHandle ); delete[] convertedLine; if ( !encodingOk ) { tqDebug( "Decoding failed, cannot load file. Please fix encoding manually." ); m_mutex.unlock(); return NULL; } /** Cleaning up code comming from DBLP */ rawText = rawText.replace( htmlRegExp, "" ); rawText = EncoderLaTeX::currentEncoderLaTeX() ->decode( rawText ); unescapeLaTeXChars( rawText ); m_textStream = new TQTextStream( rawText, IO_ReadOnly ); m_textStream->setEncoding( TQTextStream::UnicodeUTF8 ); m_currentLineNumber = 0; m_posIntCurrentLine = 0; m_currentLine = ""; File *result = new File(); TQIODevice *streamDevice = m_textStream->device(); while ( !cancelFlag && !m_textStream->atEnd() ) { emit progress( streamDevice->at(), streamDevice->size() ); tqApp->processEvents(); Element * element = nextElement(); if ( element != NULL ) { Comment *comment = dynamic_cast( element ); if ( !m_ignoreComments || comment == NULL ) result->appendElement( element ); else delete element; } tqApp->processEvents(); } emit progress( streamDevice->size(), streamDevice->size() ); if ( cancelFlag ) { tqDebug( "Loading file has been canceled" ); delete result; result = NULL; } delete m_textStream; m_mutex.unlock(); return result; } bool FileImporterBibTeX::guessCanDecode( const TQString & rawText ) { TQString text = EncoderLaTeX::currentEncoderLaTeX() ->decode( rawText ); return text.find( TQRegExp( "@\\w+\\{.+\\}" ) ) >= 0; } void FileImporterBibTeX::setIgnoreComments( bool ignoreComments ) { m_ignoreComments = ignoreComments; } void FileImporterBibTeX::cancel() { cancelFlag = TRUE; } Element *FileImporterBibTeX::nextElement() { Token token = nextToken(); if ( token == tAt ) { TQString elementType = readSimpleString(); if ( elementType.lower() == "comment" ) return readCommentElement(); else if ( elementType.lower() == "string" ) return readMacroElement(); else if ( elementType.lower() == "preamble" ) return readPreambleElement(); else if ( !elementType.isEmpty() ) return readEntryElement( elementType ); else { tqDebug( "ElementType is empty" ); return NULL; } } else if ( token == tUnknown ) { tqDebug( "Unknown token near line %i, treating as comment", m_currentLineNumber ); return readPlainCommentElement(); } if ( token != tEOF ) tqDebug( "Don't know how to parse next token near line %i: %s", m_currentLineNumber, tokenidToString( token ).latin1() ); return NULL; } Comment *FileImporterBibTeX::readCommentElement() { while ( m_currentChar != '{' && m_currentChar != '(' && !m_textStream->atEnd() ) m_currentChar = nextChar(); return new Comment( readBracketString( m_currentChar ), TRUE ); } Comment *FileImporterBibTeX::readPlainCommentElement() { TQString result = m_currentChar; result += readLine(); m_currentChar = nextChar(); while ( !m_textStream->atEnd() && m_currentChar != '@' && !m_currentChar.isSpace() ) { result.append( '\n' ).append( m_currentChar ); m_currentChar = nextChar(); result.append( readLine() ); m_currentChar = nextChar(); } return new Comment( result, FALSE ); } Macro *FileImporterBibTeX::readMacroElement() { Token token = nextToken(); while ( token != tBracketOpen ) { if ( token == tEOF ) { tqDebug( "Error in parsing unknown macro (near line %i): Opening curly brace ({) expected", m_currentLineNumber ); return NULL; } token = nextToken(); } TQString key = readSimpleString(); if ( nextToken() != tAssign ) { tqDebug( "Error in parsing macro '%s' (near line %i): Assign symbol (=) expected", key.latin1(), m_currentLineNumber ); return NULL; } Macro *macro = new Macro( key ); do { bool isStringKey = FALSE; TQString text = readString( isStringKey ).replace( TQRegExp( "\\s+" ), " " ); if ( isStringKey ) macro->value()->items.append( new MacroKey( text ) ); else macro->value()->items.append( new BibTeX::PlainText( text ) ); token = nextToken(); } while ( token == tDoublecross ); return macro; } Preamble *FileImporterBibTeX::readPreambleElement() { Token token = nextToken(); while ( token != tBracketOpen ) { if ( token == tEOF ) { tqDebug( "Error in parsing unknown preamble (near line %i): Opening curly brace ({) expected", m_currentLineNumber ); return NULL; } token = nextToken(); } Preamble *preamble = new Preamble( ); do { bool isStringKey = FALSE; TQString text = readString( isStringKey ).replace( TQRegExp( "\\s+" ), " " ); if ( isStringKey ) preamble->value()->items.append( new MacroKey( text ) ); else preamble->value()->items.append( new BibTeX::PlainText( text ) ); token = nextToken(); } while ( token == tDoublecross ); return preamble; } Entry *FileImporterBibTeX::readEntryElement( const TQString& typeString ) { Token token = nextToken(); while ( token != tBracketOpen ) { if ( token == tEOF ) { tqDebug( "Error in parsing unknown entry (near line %i): Opening curly brace ({) expected", m_currentLineNumber ); return NULL; } token = nextToken(); } TQString key = readSimpleString(); Entry *entry = new Entry( typeString, key ); token = nextToken(); do { if ( token == tBracketClose || token == tEOF ) break; else if ( token != tComma ) { tqDebug( "Error in parsing entry '%s' (near line %i): Comma symbol (,) expected but got 0x%x (token %s)", key.latin1(), m_currentLineNumber, m_currentChar.unicode(), tokenidToString( token ).latin1() ); delete entry; return NULL; } TQString fieldTypeName = readSimpleString(); token = nextToken(); if ( fieldTypeName == TQString::null || token == tBracketClose ) { // entry is buggy, but we still accept it break; } else if ( token != tAssign ) { tqDebug( "Error in parsing entry '%s' (near line %i): Assign symbol (=) expected after field name '%s'", key.latin1(), m_currentLineNumber, fieldTypeName.latin1() ); delete entry; return NULL; } /** check for duplicate fields */ if ( entry->getField( fieldTypeName ) != NULL ) { int i = 1; TQString appendix = TQString::number( i ); while ( entry->getField( fieldTypeName + appendix ) != NULL ) { ++i; appendix = TQString::number( i ); } fieldTypeName += appendix; } EntryField *entryField = new EntryField( fieldTypeName ); token = readValue( entryField->value(), entryField->fieldType() ); entry->addField( entryField ); } while ( TRUE ); return entry; } FileImporterBibTeX::Token FileImporterBibTeX::nextToken() { if ( m_textStream->atEnd() ) return tEOF; Token curToken = tUnknown; while (( m_currentChar.isSpace() || m_currentChar == '\t' ) && !m_textStream->atEnd() ) m_currentChar = nextChar(); switch ( m_currentChar.latin1() ) { case '@': curToken = tAt; break; case '{': case '(': curToken = tBracketOpen; break; case '}': case ')': curToken = tBracketClose; break; case ',': curToken = tComma; break; case '=': curToken = tAssign; break; case '#': curToken = tDoublecross; break; default: if ( m_textStream->atEnd() ) curToken = tEOF; } if ( curToken != tUnknown && curToken != tEOF ) m_currentChar = nextChar(); return curToken; } TQString FileImporterBibTeX::readString( bool &isStringKey ) { while ( m_currentChar.isSpace() ) m_currentChar = nextChar(); isStringKey = FALSE; switch ( m_currentChar.latin1() ) { case '{': case '(': return readBracketString( m_currentChar ); case '"': return readQuotedString(); default: isStringKey = TRUE; return readSimpleString(); } } TQString FileImporterBibTeX::readSimpleString( TQChar until ) { TQString result; while ( m_currentChar.isSpace() ) m_currentChar = nextChar(); if ( m_currentChar.isLetterOrNumber() || extraAlphaNumChars.contains( m_currentChar ) ) { result.append( m_currentChar ); m_currentChar = nextChar(); } while ( !m_textStream->atEnd() ) { if ( until != '\0' ) { if ( m_currentChar != until ) result.append( m_currentChar ); else break; } else if ( m_currentChar.isLetterOrNumber() || extraAlphaNumChars.contains( m_currentChar ) ) result.append( m_currentChar ); else if ( m_currentChar == "," || m_currentChar == "(" || m_currentChar == ")" || m_currentChar == "{" || m_currentChar == "}" || m_currentChar == "=" || m_currentChar == "#" || m_currentChar == "@" || m_currentChar.isSpace() ) break; else { tqDebug( "Unknown letter or number: 0x%x", m_currentChar.unicode() ); // break; } m_currentChar = nextChar(); } return result; } TQString FileImporterBibTeX::readQuotedString() { TQString result; TQChar lastChar = m_currentChar; m_currentChar = nextChar(); while ( !m_textStream->atEnd() ) { if ( m_currentChar != '"' || lastChar == '\\' ) result.append( m_currentChar ); else break; lastChar = m_currentChar; m_currentChar = nextChar(); } /** read character after closing " */ m_currentChar = nextChar(); return result; } TQString FileImporterBibTeX::readLine() { TQString result = m_currentLine.mid( m_posIntCurrentLine ); m_posIntCurrentLine = m_currentLine.length() + 2; return result; } TQString FileImporterBibTeX::readBracketString( const TQChar openingBracket ) { TQString result; TQChar closingBracket = '}'; if ( openingBracket == '(' ) closingBracket = ')'; int counter = 1; m_currentChar = nextChar(); while ( !m_textStream->atEnd() ) { if ( m_currentChar == openingBracket ) counter++; else if ( m_currentChar == closingBracket ) counter--; if ( counter == 0 ) break; else result.append( m_currentChar ); m_currentChar = nextChar(); } m_currentChar = nextChar(); return result; } FileImporterBibTeX::Token FileImporterBibTeX::readValue( Value *value, EntryField::FieldType fieldType ) { Token token = tUnknown; do { bool isStringKey = FALSE; TQString text = readString( isStringKey ).replace( TQRegExp( "\\s+" ), " " ); switch ( fieldType ) { case EntryField::ftKeywords: { if ( isStringKey ) tqDebug( "WARNING: Cannot handle keywords that are macros" ); else value->items.append( new KeywordContainer( text ) ); } break; case EntryField::ftAuthor: case EntryField::ftEditor: { if ( isStringKey ) tqDebug( "WARNING: Cannot handle authors/editors that are macros" ); else { TQStringList persons; splitPersons( text, persons ); PersonContainer *container = new PersonContainer( m_personFirstNameFirst ); for ( TQStringList::ConstIterator pit = persons.constBegin(); pit != persons.constEnd(); ++pit ) container->persons.append( new Person( *pit, m_personFirstNameFirst ) ); value->items.append( container ); } } break; case EntryField::ftPages: text.replace( TQRegExp( "\\s*--?\\s*" ), TQChar( 0x2013 ) ); default: { if ( isStringKey ) value->items.append( new MacroKey( text ) ); else value->items.append( new BibTeX::PlainText( text ) ); } } token = nextToken(); } while ( token == tDoublecross ); return token; } void FileImporterBibTeX::unescapeLaTeXChars( TQString &text ) { text.replace( "\\&", "&" ); } void FileImporterBibTeX::splitPersons( const TQString& text, TQStringList &persons ) { TQStringList wordList; TQString word; int bracketCounter = 0; for ( unsigned int pos = 0;pos < text.length();++pos ) { if ( text[pos] == '{' ) ++bracketCounter; else if ( text[pos] == '}' ) --bracketCounter; if ( text[pos] == ' ' || text[pos] == '\n' || text[pos] == '\r' ) { if ( word == "and" && bracketCounter == 0 ) { persons.append( wordList.join( " " ) ); wordList.clear(); } else if ( !word.isEmpty() ) wordList.append( word ); word = ""; } else word.append( text[pos] ); } wordList.append( word ); persons.append( wordList.join( " " ) ); } void FileImporterBibTeX::evaluateParameterComments( iconv_t &iconvHandle, const char *cline ) { /** simple preliminary checks before expensive conversion to TQString */ if ( cline[0] == '@' && cline[1] == 'c' ) { TQString line = TQString( cline ).lower(); /** check if this file requests a special encoding */ if ( line.startsWith( "@comment{x-kbibtex-encoding=" ) && line.endsWith( "}\n" ) ) { TQString newEncoding = line.mid( 28, line.length() - 30 ); tqDebug( "x-kbibtex-encoding=<%s>", newEncoding.latin1() ); if ( newEncoding == "latex" ) newEncoding = "utf-8"; iconv_close( iconvHandle ); iconvHandle = iconv_open( "utf-8", newEncoding.append( '\0' ).ascii() ); } } } TQChar FileImporterBibTeX::nextChar() { bool atEndOfLine = m_posIntCurrentLine >= m_currentLine.length(); while (( m_posIntCurrentLine >= m_currentLine.length() || m_currentLine.isEmpty() || m_currentLine.isNull() ) && !m_textStream->atEnd() ) { m_currentLine = m_textStream->readLine(); m_posIntCurrentLine = 0; ++m_currentLineNumber; } if ( atEndOfLine ) return TQChar( ' ' ); else if ( m_posIntCurrentLine < m_currentLine.length() ) { TQChar result = m_currentLine[m_posIntCurrentLine]; ++m_posIntCurrentLine; return result; } return TQChar(); } TQString FileImporterBibTeX::tokenidToString( Token token ) { switch ( token ) { case tAt: return TQString( "At" ); case tBracketClose: return TQString( "BracketClose" ); case tBracketOpen: return TQString( "BracketOpen" ); case tAlphaNumText: return TQString( "AlphaNumText" ); case tAssign: return TQString( "Assign" ); case tComma: return TQString( "Comma" ); case tDoublecross: return TQString( "Doublecross" ); case tEOF: return TQString( "EOF" ); case tUnknown: return TQString( "Unknown" ); default: return TQString( "" ); } } }