summary | refs | log | tree | commit | diff | stats
path: root/src/fileimporterbibtex.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/fileimporterbibtex.cpp')
-rw-r--r-- src/fileimporterbibtex.cpp 658
1 files changed, 658 insertions, 0 deletions
diff --git a/src/fileimporterbibtex.cpp b/src/fileimporterbibtex.cpp
new file mode 100644
index 0000000..5312f0c
--- /dev/null
+++ b/src/fileimporterbibtex.cpp
@@ -0,0 +1,658 @@
+/***************************************************************************
+* Copyright (C) 2004-2009 by Thomas Fischer *
+* fischer@unix-ag.uni-kl.de *
+* *
+* This program is free software; you can redistribute it and/or modify *
+* it under the terms of the GNU General Public License as published by *
+* the Free Software Foundation; either version 2 of the License, or *
+* (at your option) any later version. *
+* *
+* This program is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+* GNU General Public License for more details. *
+* *
+* You should have received a copy of the GNU General Public License *
+* along with this program; if not, write to the *
+* Free Software Foundation, Inc., *
+* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+***************************************************************************/
+#include <qiodevice.h>
+#include <qregexp.h>
+#include <qapplication.h>
+
+#include <file.h>
+#include <comment.h>
+#include <macro.h>
+#include <preamble.h>
+#include <entry.h>
+#include <element.h>
+#include <encoderlatex.h>
+#include <value.h>
+
+#include "fileimporterbibtex.h"
+
+#define max(a,b) ((a)<(b)?(b):(a))
+
+namespace BibTeX
+{
+ /** Non-alphanumeric characters that readSimpleString() still accepts as part of a simple string */
+ const QString extraAlphaNumChars( "?'`-_:.+/$\\\"&" );
+ /** Case-insensitive pattern for opening/closing <a> and <pre> tags, removed from the raw text by load() */
+ const QRegExp htmlRegExp( "</?(a|pre)[^>]*>", false );
+
+ /**
+  * Prepares an importer; nothing is read until load() is called.
+  *
+  * @param personFirstNameFirst stored and later passed to every
+  *        PersonContainer and Person created while parsing
+  * @param encoding name of the source encoding handed to iconv by load();
+  *        load() reads files with the value "latex" as UTF-8
+  */
+ FileImporterBibTeX::FileImporterBibTeX( bool personFirstNameFirst, QString encoding )
+         : FileImporter(), m_personFirstNameFirst( personFirstNameFirst ), m_currentChar( ' ' ), m_ignoreComments( FALSE ), m_lineBufferSize( 4096 ), m_encoding( encoding )
+ {
+     /** the text stream only exists while load() is running */
+     m_textStream = NULL;
+     /** raw line buffer, released again in the destructor */
+     m_lineBuffer = new char[m_lineBufferSize];
+     cancelFlag = FALSE;
+ }
+
+
+ /** Releases the raw line buffer allocated by the constructor. */
+ FileImporterBibTeX::~FileImporterBibTeX()
+ {
+     delete[] m_lineBuffer;
+ }
+
+ /**
+  * Reads a complete BibTeX file from a device and parses it.
+  *
+  * The input is read line by line and converted to UTF-8 with iconv. The
+  * source encoding is the one given to the constructor unless a line of the
+  * form "@comment{x-kbibtex-encoding=...}" switches it (see
+  * evaluateParameterComments()). The converted text is stripped of <a>/<pre>
+  * tags, LaTeX-decoded and then parsed element by element. The Qt event
+  * loop is processed in between, so cancel() can interrupt the parsing.
+  *
+  * @param iodevice device the raw file is read from
+  * @return newly allocated File with all parsed elements, or NULL if the
+  *         text could not be decoded or loading was cancelled
+  */
+ File* FileImporterBibTeX::load( QIODevice *iodevice )
+ {
+     m_mutex.lock();
+     cancelFlag = FALSE;
+
+     QString rawText;
+     /** "latex" is a pseudo encoding: such files are read as UTF-8 */
+     const char *encodingFrom = m_encoding == "latex" ? "utf-8" : m_encoding.ascii();
+     /** value iconv_open() returns on failure */
+     const iconv_t invalidHandle = ( iconv_t )( -1 );
+     iconv_t iconvHandle = iconv_open( "utf-8", encodingFrom );
+     /** a converted line may need more bytes than the raw line, hence the factor 4 */
+     const size_t convertedSize = static_cast<size_t>( m_lineBufferSize ) * 4;
+     char *convertedLine = new char[convertedSize];
+     int len;
+     bool encodingOk = true;
+     while ( encodingOk && iodevice->isReadable() && ( len = iodevice->readLine( m_lineBuffer, m_lineBufferSize ) ) > 0 )
+     {
+         /** may replace iconvHandle if the line requests another encoding */
+         evaluateParameterComments( iconvHandle, m_lineBuffer );
+
+         /** checked here and not right after iconv_open() so that an encoding comment in the file can still repair a bad default encoding */
+         if ( iconvHandle == invalidHandle )
+         {
+             qDebug( "iconv cannot convert from the requested source encoding, maybe the encoding name is unknown?" );
+             encodingOk = false;
+             break;
+         }
+
+         char *raw = m_lineBuffer;
+         char *enc = convertedLine;
+         size_t rawLen = static_cast<size_t>( len );
+         /** offer the whole output buffer, keeping one byte for the terminating NUL */
+         size_t encLen = convertedSize - 1;
+         size_t result = iconv( iconvHandle, &raw, &rawLen, &enc, &encLen );
+
+         qApp->processEvents();
+
+         /** (size_t)-1 signals an error, a positive value counts non-reversible conversions; both are rejected */
+         if ( result != 0 )
+         {
+             /** raw has been advanced by iconv to the first byte it did not consume */
+             const int failPos = static_cast<int>( raw - m_lineBuffer );
+             QString problematic = QString( m_lineBuffer ).mid( failPos > 15 ? failPos - 15 : 0, 30 );
+             if ( problematic.isEmpty() ) problematic = QString( m_lineBuffer );
+             qDebug( "iconv resulted in error code %i for source encoding %s, maybe file is in different encoding? Problem is somewhere here: \"%s\"", static_cast<int>( result ), encodingFrom, problematic.latin1() );
+             encodingOk = false;
+             break;
+         }
+         if ( rawLen > 0 )
+         {
+             qDebug( "iconv could not convert complete string, only %i out of %i chars", static_cast<int>( len - rawLen ), len );
+             encodingOk = false;
+             break;
+         }
+         *enc = '\0';
+
+         /** remove a leading UTF-8 byte-order mark (BOM), i.e. exactly the bytes EF BB BF */
+         int offset = 0;
+         if ( static_cast<unsigned char>( convertedLine[0] ) == 0xef && static_cast<unsigned char>( convertedLine[1] ) == 0xbb && static_cast<unsigned char>( convertedLine[2] ) == 0xbf )
+             offset = 3;
+
+         rawText.append( QString::fromUtf8( convertedLine + offset ) );
+     }
+     if ( iconvHandle != invalidHandle )
+         iconv_close( iconvHandle );
+     delete[] convertedLine;
+
+     if ( !encodingOk )
+     {
+         qDebug( "Decoding failed, cannot load file. Please fix encoding manually." );
+         m_mutex.unlock();
+         return NULL;
+     }
+
+     /** Cleaning up code coming from DBLP */
+     rawText = rawText.replace( htmlRegExp, "" );
+     rawText = EncoderLaTeX::currentEncoderLaTeX() ->decode( rawText );
+     unescapeLaTeXChars( rawText );
+     m_textStream = new QTextStream( rawText, IO_ReadOnly );
+     m_textStream->setEncoding( QTextStream::UnicodeUTF8 );
+     m_currentLineNumber = 0;
+     m_posIntCurrentLine = 0;
+     m_currentLine = "";
+
+     File *result = new File();
+     QIODevice *streamDevice = m_textStream->device();
+     while ( !cancelFlag && !m_textStream->atEnd() )
+     {
+         emit progress( streamDevice->at(), streamDevice->size() );
+         qApp->processEvents();
+         Element * element = nextElement();
+         if ( element != NULL )
+         {
+             /** comments are dropped only if the caller asked for it */
+             Comment *comment = dynamic_cast<Comment*>( element );
+             if ( !m_ignoreComments || comment == NULL )
+                 result->appendElement( element );
+             else
+                 delete element;
+         }
+         qApp->processEvents();
+     }
+     emit progress( streamDevice->size(), streamDevice->size() );
+
+     if ( cancelFlag )
+     {
+         qDebug( "Loading file has been canceled" );
+         delete result;
+         result = NULL;
+     }
+
+     delete m_textStream;
+     /** do not keep a dangling pointer around between two load() calls */
+     m_textStream = NULL;
+
+     m_mutex.unlock();
+     return result;
+ }
+
+ /**
+  * Heuristic check whether a text looks like BibTeX.
+  *
+  * @param rawText text to inspect; it is LaTeX-decoded before matching
+  * @return true if the decoded text contains something of the shape "@word{...}"
+  */
+ bool FileImporterBibTeX::guessCanDecode( const QString & rawText )
+ {
+     const QRegExp elementPattern( "@\\w+\\{.+\\}" );
+     const QString decoded = EncoderLaTeX::currentEncoderLaTeX() ->decode( rawText );
+     return decoded.find( elementPattern ) >= 0;
+ }
+
+ /**
+  * Chooses whether load() keeps or discards comment elements.
+  *
+  * @param ignoreComments if true, elements that are Comment objects are deleted instead of being added to the File
+  */
+ void FileImporterBibTeX::setIgnoreComments( bool ignoreComments )
+ {
+     m_ignoreComments = ignoreComments;
+ }
+
+ /** Raises the cancel flag; the parsing loop in load() checks it before each element and then returns NULL. */
+ void FileImporterBibTeX::cancel()
+ {
+     cancelFlag = TRUE;
+ }
+
+ /**
+  * Parses the next element from the text stream.
+  *
+  * An '@' starts a comment, string (macro), preamble or entry, depending on
+  * the case-insensitive type name that follows. Text that does not start
+  * with a known token is collected as a plain comment.
+  *
+  * @return the parsed element, or NULL at the end of the stream or if the
+  *         input could not be parsed
+  */
+ Element *FileImporterBibTeX::nextElement()
+ {
+     const Token token = nextToken();
+
+     if ( token == tUnknown )
+     {
+         qDebug( "Unknown token near line %i, treating as comment", m_currentLineNumber );
+         return readPlainCommentElement();
+     }
+
+     if ( token != tAt )
+     {
+         /** every token other than '@' is unexpected here; reaching the end is silent */
+         if ( token != tEOF )
+             qDebug( "Don't know how to parse next token near line %i: %s", m_currentLineNumber, tokenidToString( token ).latin1() );
+         return NULL;
+     }
+
+     QString elementType = readSimpleString();
+     const QString loweredType = elementType.lower();
+     if ( loweredType == "comment" )
+         return readCommentElement();
+     if ( loweredType == "string" )
+         return readMacroElement();
+     if ( loweredType == "preamble" )
+         return readPreambleElement();
+     if ( !elementType.isEmpty() )
+         return readEntryElement( elementType );
+
+     qDebug( "ElementType is empty" );
+     return NULL;
+ }
+
+ /**
+  * Reads the body of an "@comment" element.
+  *
+  * Skips ahead to the opening '{' or '(' (or the end of the stream) and
+  * reads the bracketed text that follows.
+  *
+  * @return newly allocated Comment, created with TRUE as second argument
+  */
+ Comment *FileImporterBibTeX::readCommentElement()
+ {
+     while ( !( m_currentChar == '{' || m_currentChar == '(' || m_textStream->atEnd() ) )
+         m_currentChar = nextChar();
+
+     const QString commentText = readBracketString( m_currentChar );
+     return new Comment( commentText, TRUE );
+ }
+
+ /**
+  * Collects free text outside of any "@..." element as a comment.
+  *
+  * Starts with the current character and the rest of the current line and
+  * keeps adding lines until the stream ends or a line starts with '@' or
+  * with whitespace.
+  *
+  * @return newly allocated Comment, created with FALSE as second argument
+  */
+ Comment *FileImporterBibTeX::readPlainCommentElement()
+ {
+     QString text( m_currentChar );
+     text.append( readLine() );
+     m_currentChar = nextChar();
+
+     for ( ;; )
+     {
+         if ( m_textStream->atEnd() || m_currentChar == '@' || m_currentChar.isSpace() )
+             break;
+
+         /** another comment line: first character, then the remainder of that line */
+         text.append( '\n' );
+         text.append( m_currentChar );
+         m_currentChar = nextChar();
+         text.append( readLine() );
+         m_currentChar = nextChar();
+     }
+
+     return new Comment( text, FALSE );
+ }
+
+ /**
+  * Reads an "@string" element of the form { key = value # value ... }.
+  *
+  * Each part of the value becomes either a MacroKey (bare word) or a
+  * PlainText (bracketed or quoted string); runs of whitespace inside a part
+  * are collapsed to a single blank.
+  *
+  * @return newly allocated Macro, or NULL if the opening bracket or the
+  *         assignment sign is missing
+  */
+ Macro *FileImporterBibTeX::readMacroElement()
+ {
+     /** skip everything up to the opening bracket */
+     for ( Token opening = nextToken(); opening != tBracketOpen; opening = nextToken() )
+         if ( opening == tEOF )
+         {
+             qDebug( "Error in parsing unknown macro (near line %i): Opening curly brace ({) expected", m_currentLineNumber );
+             return NULL;
+         }
+
+     QString key = readSimpleString();
+     if ( nextToken() != tAssign )
+     {
+         qDebug( "Error in parsing macro '%s' (near line %i): Assign symbol (=) expected", key.latin1(), m_currentLineNumber );
+         return NULL;
+     }
+
+     const QRegExp whitespaceRun( "\\s+" );
+     Macro *macro = new Macro( key );
+     for ( ;; )
+     {
+         bool isStringKey = FALSE;
+         QString text = readString( isStringKey ).replace( whitespaceRun, " " );
+         if ( isStringKey )
+             macro->value()->items.append( new MacroKey( text ) );
+         else
+             macro->value()->items.append( new BibTeX::PlainText( text ) );
+
+         /** '#' concatenates another part, anything else ends the value */
+         if ( nextToken() != tDoublecross )
+             break;
+     }
+
+     return macro;
+ }
+
+ /**
+  * Reads an "@preamble" element of the form { value # value ... }.
+  *
+  * Each part of the value becomes either a MacroKey (bare word) or a
+  * PlainText (bracketed or quoted string); runs of whitespace inside a part
+  * are collapsed to a single blank.
+  *
+  * @return newly allocated Preamble, or NULL if no opening bracket is found
+  */
+ Preamble *FileImporterBibTeX::readPreambleElement()
+ {
+     /** skip everything up to the opening bracket */
+     for ( Token opening = nextToken(); opening != tBracketOpen; opening = nextToken() )
+         if ( opening == tEOF )
+         {
+             qDebug( "Error in parsing unknown preamble (near line %i): Opening curly brace ({) expected", m_currentLineNumber );
+             return NULL;
+         }
+
+     const QRegExp whitespaceRun( "\\s+" );
+     Preamble *preamble = new Preamble( );
+     for ( ;; )
+     {
+         bool isStringKey = FALSE;
+         QString text = readString( isStringKey ).replace( whitespaceRun, " " );
+         if ( isStringKey )
+             preamble->value()->items.append( new MacroKey( text ) );
+         else
+             preamble->value()->items.append( new BibTeX::PlainText( text ) );
+
+         /** '#' concatenates another part, anything else ends the value */
+         if ( nextToken() != tDoublecross )
+             break;
+     }
+
+     return preamble;
+ }
+
+ /**
+  * Reads a regular entry: { key, field = value, field = value, ... }.
+  *
+  * A field name that already exists in the entry gets the smallest number
+  * (1, 2, ...) appended that makes it unique, so no field is lost.
+  *
+  * @param typeString entry type as written after the '@'
+  * @return newly allocated Entry, or NULL if the opening bracket, a comma
+  *         or an assignment sign is missing
+  */
+ Entry *FileImporterBibTeX::readEntryElement( const QString& typeString )
+ {
+     Token token;
+     /** skip everything up to the opening bracket */
+     for ( token = nextToken(); token != tBracketOpen; token = nextToken() )
+         if ( token == tEOF )
+         {
+             qDebug( "Error in parsing unknown entry (near line %i): Opening curly brace ({) expected", m_currentLineNumber );
+             return NULL;
+         }
+
+     QString key = readSimpleString();
+     Entry *entry = new Entry( typeString, key );
+
+     token = nextToken();
+     while ( token != tBracketClose && token != tEOF )
+     {
+         /** every field has to be introduced by a comma */
+         if ( token != tComma )
+         {
+             qDebug( "Error in parsing entry '%s' (near line %i): Comma symbol (,) expected but got 0x%x (token %s)", key.latin1(), m_currentLineNumber, m_currentChar.unicode(), tokenidToString( token ).latin1() );
+             delete entry;
+             return NULL;
+         }
+
+         QString fieldTypeName = readSimpleString();
+         token = nextToken();
+
+         /** entry is buggy (e.g. a comma right before the closing bracket), but we still accept it */
+         if ( fieldTypeName == QString::null || token == tBracketClose )
+             break;
+
+         if ( token != tAssign )
+         {
+             qDebug( "Error in parsing entry '%s' (near line %i): Assign symbol (=) expected after field name '%s'", key.latin1(), m_currentLineNumber, fieldTypeName.latin1() );
+             delete entry;
+             return NULL;
+         }
+
+         /** check for duplicate fields and pick the first free numbered name */
+         if ( entry->getField( fieldTypeName ) != NULL )
+         {
+             int suffix = 1;
+             while ( entry->getField( fieldTypeName + QString::number( suffix ) ) != NULL )
+                 ++suffix;
+             fieldTypeName += QString::number( suffix );
+         }
+
+         EntryField *entryField = new EntryField( fieldTypeName );
+
+         /** readValue() hands back the token that follows the value */
+         token = readValue( entryField->value(), entryField->fieldType() );
+
+         entry->addField( entryField );
+     }
+
+     return entry;
+ }
+
+ /**
+  * Skips whitespace and classifies the current character.
+  *
+  * For a recognized token the character is consumed, i.e. m_currentChar
+  * moves on to the next character. For tUnknown and tEOF the current
+  * character is left untouched.
+  *
+  * @return token type of the current character; tEOF at the end of the stream
+  */
+ FileImporterBibTeX::Token FileImporterBibTeX::nextToken()
+ {
+     if ( m_textStream->atEnd() )
+         return tEOF;
+
+     while (( m_currentChar.isSpace() || m_currentChar == '\t' ) && !m_textStream->atEnd() )
+         m_currentChar = nextChar();
+
+     Token recognized;
+     switch ( m_currentChar.latin1() )
+     {
+     case '@':
+         recognized = tAt;
+         break;
+     case '{':
+     case '(':
+         recognized = tBracketOpen;
+         break;
+     case '}':
+     case ')':
+         recognized = tBracketClose;
+         break;
+     case ',':
+         recognized = tComma;
+         break;
+     case '=':
+         recognized = tAssign;
+         break;
+     case '#':
+         recognized = tDoublecross;
+         break;
+     default:
+         /** nothing is consumed for unknown characters or at the end of the stream */
+         return m_textStream->atEnd() ? tEOF : tUnknown;
+     }
+
+     m_currentChar = nextChar();
+     return recognized;
+ }
+
+ /**
+  * Reads one value part, picking the reader by its first character.
+  *
+  * @param isStringKey set to TRUE if the part is a bare word (read with
+  *        readSimpleString()), FALSE for bracketed or quoted text
+  * @return text of the part without its delimiters
+  */
+ QString FileImporterBibTeX::readString( bool &isStringKey )
+ {
+     while ( m_currentChar.isSpace() )
+         m_currentChar = nextChar();
+
+     const char first = m_currentChar.latin1();
+
+     if ( first == '{' || first == '(' )
+     {
+         isStringKey = FALSE;
+         return readBracketString( m_currentChar );
+     }
+
+     if ( first == '"' )
+     {
+         isStringKey = FALSE;
+         return readQuotedString();
+     }
+
+     isStringKey = TRUE;
+     return readSimpleString();
+ }
+
+ /**
+  * Reads a bare word such as an element type, a key or a field name.
+  *
+  * @param until if not '\0', everything up to (but excluding) this character
+  *        is read; otherwise reading stops at whitespace or at one of
+  *        , ( ) { } = # @ and characters that are neither alphanumeric nor in
+  *        extraAlphaNumChars are reported and skipped
+  * @return the characters read; may be a null string
+  */
+ QString FileImporterBibTeX::readSimpleString( QChar until )
+ {
+     QString result;
+
+     while ( m_currentChar.isSpace() )
+         m_currentChar = nextChar();
+
+     /** the first character is taken over before the end-of-stream check of the loop below */
+     if ( m_currentChar.isLetterOrNumber() || extraAlphaNumChars.contains( m_currentChar ) )
+     {
+         result.append( m_currentChar );
+         m_currentChar = nextChar();
+     }
+
+     const bool stopAtGivenChar = until != '\0';
+     for ( ; !m_textStream->atEnd(); m_currentChar = nextChar() )
+     {
+         if ( stopAtGivenChar )
+         {
+             if ( m_currentChar == until )
+                 break;
+             result.append( m_currentChar );
+             continue;
+         }
+
+         if ( m_currentChar.isLetterOrNumber() || extraAlphaNumChars.contains( m_currentChar ) )
+         {
+             result.append( m_currentChar );
+             continue;
+         }
+
+         if ( m_currentChar == "," || m_currentChar == "(" || m_currentChar == ")" || m_currentChar == "{" || m_currentChar == "}" || m_currentChar == "=" || m_currentChar == "#" || m_currentChar == "@" || m_currentChar.isSpace() )
+             break;
+
+         /** neither part of the word nor a delimiter: report it and go on with the next character */
+         qDebug( "Unknown letter or number: 0x%x", m_currentChar.unicode() );
+     }
+
+     return result;
+ }
+
+ /**
+  * Reads text enclosed in double quotes; the current character is expected
+  * to be the opening quote.
+  *
+  * A quote directly preceded by a backslash does not end the string.
+  *
+  * @return text between the quotes
+  */
+ QString FileImporterBibTeX::readQuotedString()
+ {
+     QString result;
+     QChar previous = m_currentChar;
+
+     for ( m_currentChar = nextChar(); !m_textStream->atEnd(); m_currentChar = nextChar() )
+     {
+         if ( m_currentChar == '"' && previous != '\\' )
+             break;
+         result.append( m_currentChar );
+         previous = m_currentChar;
+     }
+
+     /** read character after closing " */
+     m_currentChar = nextChar();
+
+     return result;
+ }
+
+ /**
+  * Returns the unread rest of the current line.
+  *
+  * The read position is moved behind the end of the line, so the next call
+  * of nextChar() fetches a new line from the stream.
+  *
+  * @return text from the current position to the end of the current line
+  */
+ QString FileImporterBibTeX::readLine()
+ {
+     const QString remainder = m_currentLine.mid( m_posIntCurrentLine );
+     m_posIntCurrentLine = m_currentLine.length() + 2;
+     return remainder;
+ }
+
+ /**
+  * Reads text enclosed in a pair of brackets, honouring nesting.
+  *
+  * @param openingBracket bracket that opened the text; '(' is closed by ')',
+  *        every other value is closed by '}'
+  * @return text between the outermost brackets, inner brackets included
+  */
+ QString FileImporterBibTeX::readBracketString( const QChar openingBracket )
+ {
+     const QChar closingBracket = openingBracket == '(' ? QChar( ')' ) : QChar( '}' );
+     QString result;
+     /** number of brackets currently open; the outermost one is already counted */
+     int depth = 1;
+
+     for ( m_currentChar = nextChar(); !m_textStream->atEnd(); m_currentChar = nextChar() )
+     {
+         if ( m_currentChar == openingBracket )
+             ++depth;
+         else if ( m_currentChar == closingBracket )
+             --depth;
+
+         if ( depth == 0 )
+             break;
+
+         result.append( m_currentChar );
+     }
+
+     /** read character after the closing bracket */
+     m_currentChar = nextChar();
+     return result;
+ }
+
+ /**
+  * Reads a field value, i.e. one or more parts joined by '#'.
+  *
+  * Runs of whitespace in each part are collapsed to a single blank. What is
+  * appended to the value depends on the field type:
+  *  - keywords: a KeywordContainer (macros are not supported, only a warning is printed)
+  *  - author/editor: a PersonContainer with one Person per name (macros are not supported, only a warning is printed)
+  *  - pages: "-" or "--" with surrounding whitespace becomes an en dash (U+2013), then handled like any other field
+  *  - any other field: a MacroKey for a bare word, a PlainText otherwise
+  *
+  * @param value value object the parts are appended to
+  * @param fieldType type of the field the value belongs to
+  * @return the first token after the value that is not '#'
+  */
+ FileImporterBibTeX::Token FileImporterBibTeX::readValue( Value *value, EntryField::FieldType fieldType )
+ {
+     const QRegExp whitespaceRun( "\\s+" );
+
+     for ( ;; )
+     {
+         bool isStringKey = FALSE;
+         QString text = readString( isStringKey ).replace( whitespaceRun, " " );
+
+         if ( fieldType == EntryField::ftKeywords )
+         {
+             if ( isStringKey )
+                 qDebug( "WARNING: Cannot handle keywords that are macros" );
+             else
+                 value->items.append( new KeywordContainer( text ) );
+         }
+         else if ( fieldType == EntryField::ftAuthor || fieldType == EntryField::ftEditor )
+         {
+             if ( isStringKey )
+                 qDebug( "WARNING: Cannot handle authors/editors that are macros" );
+             else
+             {
+                 QStringList names;
+                 splitPersons( text, names );
+                 PersonContainer *container = new PersonContainer( m_personFirstNameFirst );
+                 for ( QStringList::ConstIterator name = names.constBegin(); name != names.constEnd(); ++name )
+                     container->persons.append( new Person( *name, m_personFirstNameFirst ) );
+                 value->items.append( container );
+             }
+         }
+         else
+         {
+             /** page ranges are normalized first and then stored like any generic field */
+             if ( fieldType == EntryField::ftPages )
+                 text.replace( QRegExp( "\\s*--?\\s*" ), QChar( 0x2013 ) );
+
+             if ( isStringKey )
+                 value->items.append( new MacroKey( text ) );
+             else
+                 value->items.append( new BibTeX::PlainText( text ) );
+         }
+
+         /** '#' announces another part, every other token ends the value */
+         const Token token = nextToken();
+         if ( token != tDoublecross )
+             return token;
+     }
+ }
+
+ /**
+  * Undoes LaTeX escaping that is not wanted in the parsed text.
+  * Currently only "\&" is turned back into "&".
+  *
+  * @param text text to modify in place
+  */
+ void FileImporterBibTeX::unescapeLaTeXChars( QString &text )
+ {
+     text.replace( "\\&", "&" );
+ }
+
+ /**
+  * Splits an author/editor value into the individual names.
+  *
+  * Names are separated by the word "and" outside of curly brackets. The
+  * words of each name are joined by single blanks. Empty names, as they
+  * would result from empty input or from a leading, trailing or doubled
+  * separator, are not added.
+  *
+  * @param text complete value of the author or editor field
+  * @param persons list the names are appended to
+  */
+ void FileImporterBibTeX::splitPersons( const QString& text, QStringList &persons )
+ {
+     QStringList wordList;
+     QString word;
+     /** nesting depth of curly brackets; an "and" inside brackets belongs to the name */
+     int bracketCounter = 0;
+
+     for ( unsigned int pos = 0; pos < text.length(); ++pos )
+     {
+         const QChar current = text[pos];
+
+         if ( current == '{' )
+             ++bracketCounter;
+         else if ( current == '}' )
+             --bracketCounter;
+
+         if ( current == ' ' || current == '\t' || current == '\n' || current == '\r' )
+         {
+             if ( word == "and" && bracketCounter == 0 )
+             {
+                 /** separator found: finish the current name unless it is empty */
+                 if ( !wordList.isEmpty() )
+                     persons.append( wordList.join( " " ) );
+                 wordList.clear();
+             }
+             else if ( !word.isEmpty() )
+                 wordList.append( word );
+
+             word = "";
+         }
+         else
+             word.append( current );
+     }
+
+     /** flush the last word and the last name, but never add empty ones */
+     if ( !word.isEmpty() )
+         wordList.append( word );
+     if ( !wordList.isEmpty() )
+         persons.append( wordList.join( " " ) );
+ }
+
+ /**
+  * Checks a raw input line for the directive
+  * "@comment{x-kbibtex-encoding=NAME}" and, if present, switches the iconv
+  * conversion to the source encoding NAME ("latex" is read as "utf-8").
+  *
+  * The directive is matched case-insensitively and may be followed by any
+  * kind of line ending or trailing whitespace.
+  *
+  * @param iconvHandle conversion handle in use; closed and replaced if the
+  *        line requests an encoding. If iconv does not know the requested
+  *        encoding the handle is left as (iconv_t)(-1) afterwards, so the
+  *        caller must check it before converting.
+  * @param cline NUL-terminated raw line as read from the file
+  */
+ void FileImporterBibTeX::evaluateParameterComments( iconv_t &iconvHandle, const char *cline )
+ {
+     /** simple preliminary checks before expensive conversion to QString */
+     if ( cline == NULL || cline[0] != '@' || ( cline[1] != 'c' && cline[1] != 'C' ) )
+         return;
+
+     /** lower-cased for matching; trailing "\n", "\r\n" or blanks are dropped */
+     const QString line = QString( cline ).lower().stripWhiteSpace();
+     const QString prefix( "@comment{x-kbibtex-encoding=" );
+
+     /** check if this file requests a special encoding */
+     if ( !line.startsWith( prefix ) || !line.endsWith( "}" ) )
+         return;
+
+     /** text between the prefix and the closing bracket */
+     QString newEncoding = line.mid( prefix.length(), line.length() - prefix.length() - 1 ).stripWhiteSpace();
+     qDebug( "x-kbibtex-encoding=<%s>", newEncoding.latin1() );
+     if ( newEncoding.isEmpty() )
+     {
+         /** nothing usable was requested, keep the current conversion */
+         return;
+     }
+     if ( newEncoding == "latex" ) newEncoding = "utf-8";
+
+     /** value iconv_open() returns on failure; such a handle must not be closed */
+     const iconv_t invalidHandle = ( iconv_t )( -1 );
+     if ( iconvHandle != invalidHandle )
+         iconv_close( iconvHandle );
+
+     iconvHandle = iconv_open( "utf-8", newEncoding.ascii() );
+     if ( iconvHandle == invalidHandle )
+         qDebug( "iconv does not support the encoding <%s> requested by the file", newEncoding.latin1() );
+ }
+
+ /**
+  * Delivers the next character of the text stream.
+  *
+  * New lines are fetched from the stream as needed. If the current line
+  * was already used up when this function was called, a single blank is
+  * returned in place of the line break and the first character of the
+  * freshly read line is delivered by the following call.
+  *
+  * @return next character, a blank at a line break, or a null QChar if no
+  *         character is available
+  */
+ QChar FileImporterBibTeX::nextChar()
+ {
+     /** remembered before refilling, since refilling resets the position */
+     const bool lineWasUsedUp = m_posIntCurrentLine >= m_currentLine.length();
+
+     for ( ;; )
+     {
+         const bool needsNewLine = m_posIntCurrentLine >= m_currentLine.length() || m_currentLine.isEmpty() || m_currentLine.isNull();
+         if ( !needsNewLine || m_textStream->atEnd() )
+             break;
+
+         m_currentLine = m_textStream->readLine();
+         m_posIntCurrentLine = 0;
+         ++m_currentLineNumber;
+     }
+
+     if ( lineWasUsedUp )
+         return QChar( ' ' );
+
+     if ( m_posIntCurrentLine < m_currentLine.length() )
+     {
+         const QChar current = m_currentLine[m_posIntCurrentLine];
+         ++m_posIntCurrentLine;
+         return current;
+     }
+
+     return QChar();
+ }
+
+ /**
+  * Gives a printable name for a token, used in debug messages.
+  *
+  * @param token token to name
+  * @return name of the token, "<Unknown>" for values not listed here
+  */
+ QString FileImporterBibTeX::tokenidToString( Token token )
+ {
+     const char *label = "<Unknown>";
+
+     switch ( token )
+     {
+     case tAt:
+         label = "At";
+         break;
+     case tBracketClose:
+         label = "BracketClose";
+         break;
+     case tBracketOpen:
+         label = "BracketOpen";
+         break;
+     case tAlphaNumText:
+         label = "AlphaNumText";
+         break;
+     case tAssign:
+         label = "Assign";
+         break;
+     case tComma:
+         label = "Comma";
+         break;
+     case tDoublecross:
+         label = "Doublecross";
+         break;
+     case tEOF:
+         label = "EOF";
+         break;
+     case tUnknown:
+         label = "Unknown";
+         break;
+     default:
+         break;
+     }
+
+     return QString( label );
+ }
+}