/*************************************************************************** * Copyright (C) 2004-2009 by Thomas Fischer * * fischer@unix-ag.uni-kl.de * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * ***************************************************************************/ #include #include #include #include #include #include #include #include "fileimporterbibtex.h" #include "encoderxml.h" #include "settings.h" #include "webqueryarxiv.h" namespace KBibTeX { WebQueryArXivWidget::WebQueryArXivWidget( TQWidget *parent, const char *name ) : WebQueryWidget( parent, name ) { init(); Settings *settings = Settings::self(); TQString value = settings->getWebQueryDefault( "ArXiv" ); value = value == TQString::null ? "" : value; lineEditQuery->setText( value ); slotTextChanged( value, true ); } WebQueryArXiv::WebQueryArXiv( TQWidget* parent ) : WebQuery( parent ), m_arXivServer( "www.arxiv.org" ), /** examples: Journal of Inefficient Algorithms 5 (2003) 35-39 Astrophys.J. 578 (2002) L103-L106 New J. Phys. 10 (2008) 033023 Physics Letters A 297 (2002) 4-8 Appl.Phys. B75 (2002) 655-665 JHEP 0611 (2006) 045 */ m_jourRef1( "^([a-zA-Z. ]+[a-zA-Z.])\\s*(\\d+)\\s+\\((\\d{4})\\)\\s+([0-9A-Z]+)(-([0-9A-Z]+))?$" ), /** examples: Journal of Inefficient Algorithms, Vol. 93, No. 2 (2009), pp. 42-51 International Journal of Quantum Information, Vol. 1, No. 4 (2003) 427-441 Stud. Hist. Phil. Mod. Phys., Vol 33 no 3 (2003), pp. 441-468 */ m_jourRef2( "^([a-zA-Z. ]+[a-zA-Z.]),\\s+Vol\\.?\\s+(\\d+)[,]?\\s+No\\.?\\s+(\\d+)\\s+\\((\\d{4})\\)[,]?\\s+(pp\\.\\s+)?(\\d+)(-(\\d+))?$" ), /** examples: Journal of Inefficient Algorithms, volume 4, number 1, pp. 12-21, 2008 Scientometrics, volume 69, number 3, pp. 669-687, 2006 */ m_jourRef3( "^([a-zA-Z. ]+),\\s+volume\\s+(\\d+),\\s+number\\s+(\\d+),\\s+pp\\.\\s+(\\d+)(-(\\d+))?,\\s+(\\d{4})$" ), /** examples: Journal of Inefficient Algorithms 4(1): 101-122, 2010 JHEP0809:131,2008 Phys.Rev.D78:013004,2008 Lect.NotesPhys.690:107-127,2006 Europhys. Letters 70:1-7 (2005) Journal of Conflict Resolution 51(1): 58 - 88 (2007) Journal of Artificial Intelligence Research (JAIR), 9:247-293 */ m_jourRef4( "^([a-zA-Z. ()]+)[,]?\\s*(\\d+)(\\((\\d+)\\))?:\\s*(\\d+)(\\s*-\\s*(\\d+))?(,\\s*(\\d{4})|\\s+\\((\\d{4})\\))?$" ), /** examples: Journal of Inefficient Algorithms vol. 31, 4 2000 Phys. Rev. A 71, 032339 (2005) Phys. Rev. Lett. 91, 027901 (2003) Phys. Rev. A 78, 013620 (2008) Phys. Rev. E 62, 1842 (2000) Rev. Mod. Phys. 79, 555 (2007) J. Math. Phys. 49, 032105 (2008) New J. Phys. 8, 58 (2006) Phys. Rev. Lett. 91, 217905 (2003). Physical Review B vol. 66, 161320(R) (2002) ??? Phys. Rev. Lett. 89, 057902(1--4) (2002). ??? J. Mod. Opt., 54, 2211 (2007) */ m_jourRef5( "^([a-zA-Z. ]+)\\s+(vol\\.\\s+)?(\\d+),\\s+(\\d+)(\\([A-Z]+\\))?\\s+\\((\\d{4})\\)[.]?$" ), /** examples: Journal of Inefficient Algorithms, 11(2) (1999) 42-55 Learned Publishing, 20(1) (January 2007) 16-22 */ m_jourRef6( "^([a-zA-Z. ]+),\\s+(\\d+)\\((\\d+)\\)\\s+(\\(([A-Za-z]+\\s+)?(\\d{4})\\))?\\s+(\\d+)(-(\\d+))?$" ), m_reJour( "^([a-zA-Z. ]+)" ), m_reYear( "\\b((18|19|20)\\d{2})\\b" ), m_rePages( "\\b([1-9]\\d{0,2})\\s*[-]+\\s*([1-9]\\d{0,2})\\b" ) { m_importer = new BibTeX::FileImporterBibTeX( FALSE ); m_importer->setIgnoreComments( TRUE ); m_widget = new WebQueryArXivWidget( parent ); } WebQueryArXiv::~WebQueryArXiv() { delete m_widget; delete m_importer; } TQString WebQueryArXiv::title() { return i18n( "arXiv" ); } TQString WebQueryArXiv::disclaimer() { return i18n( "arXiv is an archive for preprints" ); } TQString WebQueryArXiv::disclaimerURL() { return "http://www.arxiv.org/"; } WebQueryWidget *WebQueryArXiv::widget() { return m_widget; } void WebQueryArXiv::cancelQuery() { m_urls.clear(); // FIXME: The following code crashes KBibTeX: // if ( m_currentJob != NULL ) m_currentJob->kill( FALSE ); } void WebQueryArXiv::query() { WebQuery::query(); Settings *settings = Settings::self(); settings->setWebQueryDefault( "ArXiv", m_widget->lineEditQuery->text() ); m_urls.clear(); m_numberOfResults = m_widget->spinBoxMaxHits->value(); setNumStages( m_numberOfResults + 1 ); TQString searchTerm = m_widget->lineEditQuery->text().stripWhiteSpace().replace( '$', "" ); TQStringList queryWords = TQStringList::split( TQRegExp( "\\s+" ), searchTerm ); if ( searchTerm.isEmpty() || queryWords.size() == 0 ) { setEndSearch( WebQuery::statusInvalidQuery ); return; } TQString query; for ( unsigned int i = 0; i < queryWords.size() - 1; ++i ) query = query.append( "AND " ).append( queryWords[i] ).append( " " ); query.append( queryWords[queryWords.size()-1] ); KURL url = KURL( TQString( "http://www.arxiv.org/find/all/1/all:+%2/0/1/0/all/0/1?per_page=%1" ).arg( m_numberOfResults ).arg( query.replace( "%", "%25" ).replace( "+", "%2B" ).replace( " ", "%20" ).replace( "#", "%23" ).replace( "&", "%26" ).replace( "?", "%3F" ) ) ); m_currentJobMutex.lock(); TDEIO::Job *job = TDEIO::storedGet( url, FALSE, FALSE ); connect( job, SIGNAL( result( TDEIO::Job * ) ), this, SLOT( unlockJob( TDEIO::Job * ) ) ); connect( job, SIGNAL( result( TDEIO::Job * ) ), this, SLOT( arXivResult( TDEIO::Job * ) ) ); } void WebQueryArXiv::unlockJob( TDEIO::Job * ) { m_currentJobMutex.unlock(); } void WebQueryArXiv::arXivResult( TDEIO::Job *job ) { if ( job->error() == 0 && !m_aborted ) { enterNextStage(); TQBuffer data; data.open( IO_WriteOnly ); data.writeBlock( dynamic_cast( job )->data() ); data.close(); data.open( IO_ReadOnly ); TQTextStream ts( &data ); TQString result = ts.read(); data.close(); int p = -1; m_totalHits = 0; m_receivedHits = 0; while ( !m_aborted && ( int ) m_totalHits < m_numberOfResults && ( p = result.find( "arXiv:", p + 1 ) ) >= 0 ) { int p2 = result.find( "<", p + 2 ); TQString hit = result.mid( p + 6, p2 - p - 6 ); ++m_totalHits; p = p2 + 1; KURL url = KURL( TQString( "http://%2/abs/%1" ).arg( hit ).arg( m_arXivServer ) ); m_urls.append( url ); } if ( m_totalHits == 0 ) setEndSearch( WebQuery::statusSuccess ); else if ( !m_urls.isEmpty() ) { KURL url = m_urls.first(); m_urls.remove( url ); fetchFromAbstract( url ); } } else setEndSearch( WebQuery::statusError ); } void WebQueryArXiv::fetchFromAbstract( const KURL &abstractURL ) { m_aborted = false; m_currentJobMutex.lock(); TDEIO::Job *job = TDEIO::storedGet( abstractURL, FALSE, FALSE ); connect( job, SIGNAL( result( TDEIO::Job * ) ), this, SLOT( unlockJob( TDEIO::Job * ) ) ); connect( job, SIGNAL( result( TDEIO::Job * ) ), this, SLOT( arXivAbstractResult( TDEIO::Job * ) ) ); } void WebQueryArXiv::arXivAbstractResult( TDEIO::Job *job ) { if ( job->error() == 0 && !m_aborted ) { ++m_receivedHits; enterNextStage(); TQBuffer data; data.open( IO_WriteOnly ); data.writeBlock( dynamic_cast( job )->data() ); data.close(); data.open( IO_ReadOnly ); TQTextStream ts( &data ); TQString result = BibTeX::EncoderXML::currentEncoderXML()->decode( ts.read() ); data.close(); /** find id */ int p = result.find( "arXiv:", 0 ); if ( p < 0 ) return; int p2 = result.find( "<", p + 2 ); if ( p2 < 0 ) return; TQString id = result.mid( p + 6, p2 - p - 6 ); /** find cite_as */ TQString citeas = ""; p = result.find( "Cite as", 0 ); p = result.find( ">arXiv:", p ); p2 = result.find( "= 0 && p2 >= 0 ) citeas = result.mid( p + 7, p2 - p - 7 ); BibTeX::Entry * entry = new BibTeX::Entry( BibTeX::Entry::etMisc, citeas.isEmpty() ? ( id.isEmpty() ? TQString( "arXiv" ).append( m_receivedHits ) : id ) : citeas ); /** find abstract */ p = result.find( "Abstract:", 0 ); if ( p < 0 ) return; p2 = result.find( "]+>" ), "" ); BibTeX::EntryField * field = new BibTeX::EntryField( BibTeX::EntryField::ftAbstract ); entry->addField( field ); field->setValue( new BibTeX::Value( abstract ) ); /** find authors */ BibTeX::PersonContainer *personContainer = new BibTeX::PersonContainer( Settings::self()->editing_FirstNameFirst ); p = -1; while (( p = result.find( "/au:", p + 1 ) ) > 0 ) { p = result.find( ">", p + 1 ); p2 = result.find( "<", p + 1 ); TQString author = result.mid( p + 1, p2 - p - 1 ); personContainer->persons.append( new BibTeX::Person( author ) ); } if ( personContainer->persons.isEmpty() ) delete personContainer; else { BibTeX::EntryField * field = new BibTeX::EntryField( BibTeX::EntryField::ftAuthor ); entry->addField( field ); BibTeX::Value *value = new BibTeX::Value(); value->items.append( personContainer ); field->setValue( value ); } /** find title */ p = result.find( "Title:", 0 ); p2 = result.find( "<", p + 10 ); if ( p >= 0 && p2 >= 0 ) { TQString title = result.mid( p + 13, p2 - p - 13 ).replace( TQRegExp( "\\s+" ), " " ).replace( TQRegExp( "^\\s+|\\s+$" ), "" ); BibTeX::EntryField * field = new BibTeX::EntryField( BibTeX::EntryField::ftTitle ); entry->addField( field ); field->setValue( new BibTeX::Value( title ) ); } /** find month and year */ p = result.find( "Submitted on", 0 ); while (( p2 = result.find( "last revised", p + 1 ) ) >= 0 ) p = p2; p2 = result.find( TQRegExp( "\\d\\d\\d\\d" ), p ); bool ok = FALSE; int year = result.mid( p2, 4 ).toInt( &ok ); if ( !ok ) year = 0; if ( year > 1000 ) { BibTeX::EntryField * field = new BibTeX::EntryField( BibTeX::EntryField::ftYear ); entry->addField( field ); field->setValue( new BibTeX::Value( TQString::number( year ) ) ); } p2 = result.find( TQRegExp( "\\b[A-Z][a-z]{2}\\b" ), p ); if ( p2 >= 0 ) { TQString month = result.mid( p2, 3 ).lower(); BibTeX::EntryField * field = new BibTeX::EntryField( BibTeX::EntryField::ftMonth ); entry->addField( field ); BibTeX::Value *value = new BibTeX::Value(); value->items.append( new BibTeX::MacroKey( month ) ); field->setValue( value ); } /** find DOI */ p = result.find( "http://dx.doi.org/", 0 ); p2 = result.find( "\"", p + 1 ); if ( p >= 0 && p2 >= 0 ) { TQString doi = result.mid( p, p2 - p ); BibTeX::EntryField * field = new BibTeX::EntryField( BibTeX::EntryField::ftDoi ); entry->addField( field ); field->setValue( new BibTeX::Value( doi ) ); } /** find keywords */ p = result.find( "", 0 ); p2 = result.find( "", p + 1 ); if ( p >= 0 && p2 >= 0 ) { TQString keywords = result.mid( p + 31, p2 - p - 31 ).replace( TQRegExp( "]*>" ), "" ); BibTeX::EntryField * field = new BibTeX::EntryField( BibTeX::EntryField::ftKeywords ); entry->addField( field ); BibTeX::Value *value = new BibTeX::Value(); value->items.append( new BibTeX::KeywordContainer( keywords ) ); field->setValue( value ); } /** find ACM classes */ p = result.find( "", 0 ); p2 = result.find( "", p + 1 ); if ( p >= 0 && p2 >= 0 ) { TQString acmclasses = result.mid( p + 34, p2 - p - 34 ); BibTeX::EntryField * field = new BibTeX::EntryField( "acm-classes" ); entry->addField( field ); field->setValue( new BibTeX::Value( acmclasses ) ); } /** find versions */ for ( int v = 1; !m_aborted && v < 20; ++v ) { p = result.find( TQString( ">[v%1]<" ).arg( v ), 0 ); if ( p < 0 ) break; int p3 = result.findRev( "href=\"", p ); if ( p3 >= 0 && p3 > p - 40 ) { p2 = result.find( "\">", p3 ); if ( p2 >= 0 ) { TQString url = result.mid( p3 + 6, p2 - p3 - 6 ); BibTeX::EntryField * field = new BibTeX::EntryField( TQString( "v%1url" ).arg( v ) ); entry->addField( field ); field->setValue( new BibTeX::Value( TQString( "http://www.arxiv.org" ).append( url ) ) ); } } p = result.find( "", p + 1 ); p2 = result.find( "= 0 && p2 >= 0 ) { TQString version = result.mid( p + 5, p2 - p - 5 ); BibTeX::EntryField * field = new BibTeX::EntryField( TQString( "v%1descr" ).arg( v ) ); entry->addField( field ); field->setValue( new BibTeX::Value( version ) ); } } /** find tech report reference */ p = result.find( "", 0 ); p2 = result.find( "", p + 1 ); if ( p >= 0 && p2 >= 0 ) { TQString techRepNr = result.mid( p + 36, p2 - p - 36 ); BibTeX::EntryField * field = new BibTeX::EntryField( BibTeX::EntryField::ftNumber ); entry->addField( field ); field->setValue( new BibTeX::Value( techRepNr ) ); entry->setEntryType( BibTeX::Entry::etTechReport ); } /** find journal reference */ p = result.find( "", 0 ); p2 = result.find( "", p + 1 ); if ( p >= 0 && p2 >= 0 ) { TQString jref = result.mid( p + 27, p2 - p - 27 ); jref.replace( "\n", " " ); TQString jTitle = ""; TQString jVol = ""; TQString jNo = ""; TQString jYear = ""; TQString jPage1 = ""; TQString jPage2 = ""; // m_jourRef1( "^([a-zA-Z. ]+[a-zA-Z.])\\s*(\\d+)\\s+\\((\\d{4})\\)\\s+([0-9A-Z]+)(-([0-9A-Z]+))?$" ) if ( m_jourRef1.search( jref ) == 0 ) { jTitle = m_jourRef1.cap( 1 ); jVol = m_jourRef1.cap( 2 ); jYear = m_jourRef1.cap( 3 ); jPage1 = m_jourRef1.cap( 4 ); jPage2 = m_jourRef1.cap( 6 ); } // m_jourRef2( "^([a-zA-Z. ]+[a-zA-Z.]),\s+Vol[.]?\s+(\d+)[,]?\s+No[.]?\s+(\d+)\s+\((\d{4})\)[,]?\s+(pp\.\s+)?(\d+)(-(\d+))?$" ) else if ( m_jourRef2.search( jref ) == 0 ) { jTitle = m_jourRef2.cap( 1 ); jVol = m_jourRef2.cap( 2 ); jNo = m_jourRef2.cap( 3 ); jYear = m_jourRef2.cap( 4 ); jPage1 = m_jourRef2.cap( 6 ); jPage2 = m_jourRef2.cap( 8 ); } // m_jourRef3( "^([a-zA-Z. ]+),\\s+volume\\s+(\\d+),\\s+number\\s+(\\d+),\\s+pp\\.\\s+(\\d+)(-(\\d+))?,\\s+(\\d{4})$" ) else if ( m_jourRef3.search( jref ) == 0 ) { jTitle = m_jourRef3.cap( 1 ); jVol = m_jourRef3.cap( 2 ); jNo = m_jourRef3.cap( 3 ); jPage1 = m_jourRef3.cap( 4 ); jPage2 = m_jourRef3.cap( 6 ); jYear = m_jourRef3.cap( 7 ); } // m_jourRef4("^([a-zA-Z. ()]+[a-zA-Z.()])[,]?\\s*(\\d+)(\\((\\d+)\\))?:\\s*(\\d+)(\\s*-\\s*(\\d+))?(,\\s*(\\d{4})|\\s+\\((\\d{4})\\))?$") else if ( m_jourRef4.search( jref ) == 0 ) { jTitle = m_jourRef4.cap( 1 ); jVol = m_jourRef4.cap( 2 ); jNo = m_jourRef4.cap( 4 ); jPage1 = m_jourRef4.cap( 5 ); jPage2 = m_jourRef4.cap( 7 ); jYear = m_jourRef4.cap( 9 ).append( m_jourRef4.cap( 10 ) ); } // m_jourRef5("^([a-zA-Z. ]+)\\s+(vol\\.\\s+)?(\\d+),\\s+(\\d+)(\\([A-Z]+\\))?\\s+\\((\\d{4})\\)[.]?$") else if ( m_jourRef5.search( jref ) == 0 ) { jTitle = m_jourRef5.cap( 1 ); jVol = m_jourRef5.cap( 3 ); jPage1 = m_jourRef5.cap( 4 ); jYear = m_jourRef5.cap( 6 ); } // m_jourRef6("^([a-zA-Z. ]+),\\s+(\\d+)\\((\\d+)\\)\\s+(\\(([A-Za-z]+\\s+)?(\\d{4})\\))?\\s+(\\d+)(-(\\d+))?$") else if ( m_jourRef6.search( jref ) == 0 ) { jTitle = m_jourRef6.cap( 1 ); jVol = m_jourRef6.cap( 2 ); jNo = m_jourRef6.cap( 3 ); jYear = m_jourRef6.cap( 6 ); jPage1 = m_jourRef6.cap( 7 ); jPage2 = m_jourRef6.cap( 9 ); } else { if ( m_reJour.search( jref ) == 0 ) jTitle = m_reJour.cap( 1 ); if ( m_reYear.search( jref ) == 0 ) jYear = m_reYear.cap( 1 ); if ( m_rePages.search( jref ) > -1 ) { jPage1 = m_rePages.cap( 1 ); jPage2 = m_rePages.cap( 2 ); } } if ( !jTitle.isEmpty() ) { entry->deleteField( BibTeX::EntryField::ftJournal ); BibTeX::EntryField * field = new BibTeX::EntryField( BibTeX::EntryField::ftJournal ); entry->addField( field ); field->setValue( new BibTeX::Value( jTitle ) ); entry->setEntryType( BibTeX::Entry::etArticle ); } if ( !jVol.isEmpty() ) { entry->deleteField( BibTeX::EntryField::ftVolume ); BibTeX::EntryField * field = new BibTeX::EntryField( BibTeX::EntryField::ftVolume ); entry->addField( field ); field->setValue( new BibTeX::Value( jVol ) ); } if ( !jNo.isEmpty() ) { entry->deleteField( BibTeX::EntryField::ftNumber ); BibTeX::EntryField * field = new BibTeX::EntryField( BibTeX::EntryField::ftNumber ); entry->addField( field ); field->setValue( new BibTeX::Value( jNo ) ); } if ( !jYear.isEmpty() ) { entry->deleteField( BibTeX::EntryField::ftYear ); entry->deleteField( BibTeX::EntryField::ftMonth ); BibTeX::EntryField * field = new BibTeX::EntryField( BibTeX::EntryField::ftYear ); entry->addField( field ); field->setValue( new BibTeX::Value( jYear ) ); } if ( !jPage1.isEmpty() ) { entry->deleteField( BibTeX::EntryField::ftPages ); BibTeX::EntryField * field = new BibTeX::EntryField( BibTeX::EntryField::ftPages ); entry->addField( field ); TQString text = jPage1; if ( !jPage2.isEmpty() ) text.append( "--" ).append( jPage2 ); field->setValue( new BibTeX::Value( text ) ); } } if ( result.find( TQRegExp( "Ph\\.?D\\.? Thesis", FALSE ), 0 ) >= 0 ) entry->setEntryType( BibTeX::Entry::etPhDThesis ); field = new BibTeX::EntryField( BibTeX::EntryField::ftURL ); entry->addField( field ); field->setValue( new BibTeX::Value( TQString( "http://arxiv.org/abs/%1" ).arg( id ) ) ); field = new BibTeX::EntryField( "pdf" ); entry->addField( field ); field->setValue( new BibTeX::Value( TQString( "http://arxiv.org/pdf/%1" ).arg( id ) ) ); emit foundEntry( entry, false ); if ( m_totalHits == m_receivedHits ) setEndSearch( WebQuery::statusSuccess ); else if ( !m_urls.isEmpty() ) { KURL url = m_urls.first(); m_urls.remove( url ); m_currentJobMutex.lock(); TDEIO::Job *job = TDEIO::storedGet( url, FALSE, FALSE ); connect( job, SIGNAL( result( TDEIO::Job * ) ), this, SLOT( unlockJob( TDEIO::Job * ) ) ); connect( job, SIGNAL( result( TDEIO::Job * ) ), this, SLOT( arXivAbstractResult( TDEIO::Job * ) ) ); } else setEndSearch( WebQuery::statusSuccess ); } else setEndSearch( WebQuery::statusError ); } } #include "webqueryarxiv.moc"