//C- -*- C++ -*- //C- ------------------------------------------------------------------- //C- DjVuLibre-3.5 //C- Copyright (c) 2002 Leon Bottou and Yann Le Cun. //C- Copyright (c) 2001 AT&T //C- //C- This software is subject to, and may be distributed under, the //C- GNU General Public License, Version 2. The license should have //C- accompanied the software or you may obtain a copy of the license //C- from the Free Software Foundation at http://www.fsf.org . //C- //C- This program is distributed in the hope that it will be useful, //C- but WITHOUT ANY WARRANTY; without even the implied warranty of //C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //C- GNU General Public License for more details. //C- //C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library //C- distributed by Lizardtech Software. On July 19th 2002, Lizardtech //C- Software authorized us to replace the original DjVu(r) Reference //C- Library notice by the following text (see doc/lizard2002.djvu): //C- //C- ------------------------------------------------------------------ //C- | DjVu (r) Reference Library (v. 3.5) //C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved. //C- | The DjVu Reference Library is protected by U.S. Pat. No. //C- | 6,058,214 and patents pending. //C- | //C- | This software is subject to, and may be distributed under, the //C- | GNU General Public License, Version 2. The license should have //C- | accompanied the software or you may obtain a copy of the license //C- | from the Free Software Foundation at http://www.fsf.org . //C- | //C- | The computer code originally released by LizardTech under this //C- | license and unmodified by other parties is deemed "the LIZARDTECH //C- | ORIGINAL CODE." Subject to any third party intellectual property //C- | claims, LizardTech grants recipient a worldwide, royalty-free, //C- | non-exclusive license to make, use, sell, or otherwise dispose of //C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the //C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU //C- | General Public License. This grant only confers the right to //C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to //C- | the extent such infringement is reasonably necessary to enable //C- | recipient to make, have made, practice, sell, or otherwise dispose //C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to //C- | any greater extent that may be necessary to utilize further //C- | modifications or combinations. //C- | //C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY //C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED //C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF //C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. //C- +------------------------------------------------------------------ // // $Id: DjVuText.cpp,v 1.10 2004/07/07 19:23:36 leonb Exp $ // $Name: release_3_5_15 $ #ifdef HAVE_CONFIG_H # include "config.h" #endif #if NEED_GNUG_PRAGMAS # pragma implementation #endif #include "DjVuText.h" #include "IFFByteStream.h" #include "BSByteStream.h" #include "debug.h" #include #ifdef HAVE_NAMESPACES namespace DJVU { # ifdef NOT_DEFINED // Just to fool emacs c++ mode } #endif #endif #ifdef min #undef min #endif template static inline TYPE min(TYPE a,TYPE b) { return (a &gbs, const Zone * parent, const Zone * prev) const { ByteStream &bs=*gbs; // Encode type bs.write8(ztype); // Modify text_start and bounding rectangle based on the context // (whether there is a previous non-zero same-level-child or parent) int start=text_start; int x=rect.xmin, y=rect.ymin; int width=rect.width(), height=rect.height(); if (prev) { if (ztype==PAGE || ztype==PARAGRAPH || ztype==LINE) { // Encode offset from the lower left corner of the previous // child in the coord system in that corner with x to the // right and y down x=x-prev->rect.xmin; y=prev->rect.ymin-(y+height); } else // Either COLUMN or WORD or CHARACTER { // Encode offset from the lower right corner of the previous // child in the coord system in that corner with x to the // right and y up x=x-prev->rect.xmax; y=y-prev->rect.ymin; } start-=prev->text_start+prev->text_length; } else if (parent) { // Encode offset from the upper left corner of the parent // in the coord system in that corner with x to the right and y down x=x-parent->rect.xmin; y=parent->rect.ymax-(y+height); start-=parent->text_start; } // Encode rectangle bs.write16(0x8000+x); bs.write16(0x8000+y); bs.write16(0x8000+width); bs.write16(0x8000+height); // Encode text info bs.write16(0x8000+start); bs.write24(text_length); // Encode number of children bs.write24(children.size()); const Zone * prev_child=0; // Encode all children for (GPosition i=children; i; ++i) { children[i].encode(gbs, this, prev_child); prev_child=&children[i]; } } #endif void DjVuTXT::Zone::decode(const GP &gbs, int maxtext, const Zone * parent, const Zone * prev) { ByteStream &bs=*gbs; // Decode type ztype = (ZoneType) bs.read8(); if ( ztypeCHARACTER ) G_THROW( ERR_MSG("DjVuText.corrupt_text") ); // Decode coordinates int x=(int) bs.read16()-0x8000; int y=(int) bs.read16()-0x8000; int width=(int) bs.read16()-0x8000; int height=(int) bs.read16()-0x8000; // Decode text info text_start = (int) bs.read16()-0x8000; // int start=text_start; text_length = bs.read24(); if (prev) { if (ztype==PAGE || ztype==PARAGRAPH || ztype==LINE) { x=x+prev->rect.xmin; y=prev->rect.ymin-(y+height); } else // Either COLUMN or WORD or CHARACTER { x=x+prev->rect.xmax; y=y+prev->rect.ymin; } text_start+=prev->text_start+prev->text_length; } else if (parent) { x=x+parent->rect.xmin; y=parent->rect.ymax-(y+height); text_start+=parent->text_start; } rect=GRect(x, y, width, height); // Get children size int size = bs.read24(); // Checks if (rect.isempty() || text_start<0 || text_start+text_length>maxtext ) G_THROW( ERR_MSG("DjVuText.corrupt_text") ); // Process children const Zone * prev_child=0; children.empty(); while (size-- > 0) { Zone *z = append_child(); z->decode(gbs, maxtext, this, prev_child); prev_child=z; } } void DjVuTXT::normalize_text() { GUTF8String newtextUTF8; page_zone.normtext( (const char*)textUTF8, newtextUTF8 ); textUTF8 = newtextUTF8; } int DjVuTXT::has_valid_zones() const { if (!textUTF8) return false; if (page_zone.children.isempty() || page_zone.rect.isempty()) return false; return true; } #ifndef NEED_DECODER_ONLY void DjVuTXT::encode(const GP &gbs) const { ByteStream &bs=*gbs; if (! textUTF8 ) G_THROW( ERR_MSG("DjVuText.no_text") ); // Encode text int textsize = textUTF8.length(); bs.write24( textsize ); bs.writall( (void*)(const char*)textUTF8, textsize ); // Encode zones if (has_valid_zones()) { bs.write8(Zone::version); page_zone.encode(gbs); } } #endif void DjVuTXT::decode(const GP &gbs) { ByteStream &bs=*gbs; // Read text textUTF8.empty(); int textsize = bs.read24(); char *buffer = textUTF8.getbuf(textsize); int readsize = bs.read(buffer,textsize); buffer[readsize] = 0; if (readsize < textsize) G_THROW( ERR_MSG("DjVuText.corrupt_chunk") ); // Try reading zones unsigned char version; if ( bs.read( (void*) &version, 1 ) == 1) { if (version != Zone::version) G_THROW( ERR_MSG("DjVuText.bad_version") "\t" + GUTF8String(version) ); page_zone.decode(gbs, textsize); } } GP DjVuTXT::copy(void) const { return new DjVuTXT(*this); } static inline bool intersects_zone(GRect box, const GRect &zone) { return ((box.xmin < zone.xmin) ?(box.xmax >= zone.xmin) :(box.xmin <= zone.xmax)) &&((box.ymin < zone.ymin) ?(box.ymax >= zone.ymin) :(box.ymin <= zone.ymax)); } void DjVuTXT::Zone::get_text_with_rect(const GRect &box, int &string_start, int &string_end) const { GPosition pos=children; if(pos?box.contains(rect):intersects_zone(box,rect)) { const int text_end=text_start+text_length; if(string_start == string_end) { string_start=text_start; string_end=text_end; }else { if (string_end < text_end) string_end=text_end; if(text_start < string_start) string_start=text_start; } }else if(pos&&intersects_zone(box,rect)) { do { children[pos].get_text_with_rect(box,string_start,string_end); } while(++pos); } } void DjVuTXT::Zone::find_zones(GList &list, const int string_start, const int string_end) const { const int text_end=text_start+text_length; if(text_start >= string_start) { if(text_end <= string_end) { list.append(const_cast(this)); } else if(text_start < string_end) { if (children.size()) for (GPosition pos=children; pos; ++pos) children[pos].find_zones(list,string_start,string_end); else list.append(const_cast(this)); } } else if( text_end > string_start) { if (children.size()) for (GPosition pos=children; pos; ++pos) children[pos].find_zones(list,string_start,string_end); else list.append(const_cast(this)); } } void DjVuTXT::Zone::get_smallest(GList &list) const { GPosition pos=children; if(pos) { do { children[pos].get_smallest(list); } while (++pos); } else { list.append(rect); } } void DjVuTXT::Zone::get_smallest(GList &list, const int padding) const { GPosition pos=children; if(pos) { do { children[pos].get_smallest(list,padding); } while (++pos); } else if(zone_parent && zone_parent->ztype >= PARAGRAPH) { const GRect &xrect=zone_parent->rect; if(xrect.height() < xrect.width()) { list.append(GRect(rect.xmin-padding,xrect.ymin-padding,rect.width() +2*padding,xrect.height()+2*padding)); } else { list.append(GRect(xrect.xmin-padding,rect.ymin-padding,xrect.width() +2*padding,rect.height()+2*padding)); } } else { list.append(GRect(rect.xmin-padding,rect.ymin-padding,rect.width() +2*padding,rect.height()+2*padding)); } } void DjVuTXT::get_zones(int zone_type, const Zone *parent, GList & zone_list) const // get all the zones of type zone_type under zone node parent { // search all branches under parent const Zone *zone=parent; for( int cur_ztype=zone->ztype; cur_ztypechildren; pos; ++pos) { Zone *zcur=(Zone *)&zone->children[pos]; if ( zcur->ztype == zone_type ) { GPosition zpos=zone_list; if ( !zone_list.search(zcur,zpos) ) zone_list.append(zcur); } else if ( zone->children[pos].ztype < zone_type ) get_zones(zone_type, &zone->children[pos], zone_list); } } } GList DjVuTXT::find_text_with_rect(const GRect &box, GUTF8String &text, const int padding) const { GList retval; int text_start=0; int text_end=0; page_zone.get_text_with_rect(box,text_start,text_end); if(text_start != text_end) { GList zones; page_zone.find_zones(zones,text_start,text_end); GPosition pos=zones; if(pos) { do { if(padding >= 0) { zones[pos]->get_smallest(retval,padding); }else { zones[pos]->get_smallest(retval); } } while(++pos); } } text=textUTF8.substr(text_start,text_end-text_start); return retval; } GList DjVuTXT::find_text_in_rect(GRect target_rect, GUTF8String &text) const // returns a list of zones of type WORD in the nearest/selected paragraph { GList zone_list; GList lines; get_zones((int)PARAGRAPH, &page_zone, zone_list); // it's possible that no paragraph structure exists for reasons that // 1) ocr engine is not capable 2) file was modified by user. In such case, // we can only make a rough guess, i.e., select all the lines intersected with // target_rect if (zone_list.isempty()) { get_zones((int)LINE, &page_zone, zone_list); GPosition pos; for(pos=zone_list; pos; ++pos) { GRect rect=zone_list[pos]->rect; int h0=rect.height()/2; if(rect.intersect(rect,target_rect) && rect.height()>h0) lines.append(zone_list[pos]); } } else { GPosition pos, pos_sel=zone_list; float ar=0; for(pos=zone_list; pos; ++pos) { GRect rect=zone_list[pos]->rect; int area=rect.area(); if (rect.intersect(rect, target_rect)) { float ftmp=rect.area()/(float)area; if ( !ar || ar0 ) parag=zone_list[pos_sel]; zone_list.empty(); if ( ar>0 ) { get_zones((int)LINE, parag, zone_list); if ( !zone_list.isempty() ) { for(GPosition pos=zone_list; pos; ++pos) { GRect rect=zone_list[pos]->rect; int h0=rect.height()/2; if(rect.intersect(rect,target_rect) && rect.height()>h0) lines.append(zone_list[pos]); } } } } zone_list.empty(); if (!lines.isempty()) { int i=1, lsize=lines.size(); GList words; for (GPosition pos=lines; pos; ++pos, ++i) { words.empty(); get_zones((int)WORD, lines[pos], words); if ( lsize==1 ) { for(GPosition p=words;p;++p) { GRect rect=words[p]->rect; if(rect.intersect(rect,target_rect)) //if (target_rect.contains(words[p]->rect)) zone_list.append(words[p]); } } else { if (i==1) { bool start=true; for(GPosition p=words; p; ++p) { if ( start ) { GRect rect=words[p]->rect; if(rect.intersect(rect,target_rect)) //if (target_rect.contains(words[p]->rect)) { start=false; zone_list.append(words[p]); } } else zone_list.append(words[p]); } } else if (i==lsize) { bool end=true; for(GPosition p=words.lastpos();p;--p) { if ( end ) { GRect rect=words[p]->rect; if(rect.intersect(rect,target_rect)) //if(target_rect.contains(words[p]->rect) ) { end=false; zone_list.append(words[p]); } } else zone_list.append(words[p]); } } if (i!=1 && i!=lsize ) { for(GPosition p=words;p;++p) zone_list.append(words[p]); } } } } return zone_list; } unsigned int DjVuTXT::get_memory_usage() const { return sizeof(*this) + textUTF8.length() + page_zone.memuse() - sizeof(page_zone); } //*************************************************************************** //******************************** DjVuText ********************************* //*************************************************************************** void DjVuText::decode(const GP &gbs) { GUTF8String chkid; GP giff=IFFByteStream::create(gbs); IFFByteStream &iff=*giff; while( iff.get_chunk(chkid) ) { if (chkid == "TXTa") { if (txt) G_THROW( ERR_MSG("DjVuText.dupl_text") ); txt = DjVuTXT::create(); txt->decode(iff.get_bytestream()); } else if (chkid == "TXTz") { if (txt) G_THROW( ERR_MSG("DjVuText.dupl_text") ); txt = DjVuTXT::create(); const GP gbsiff=BSByteStream::create(iff.get_bytestream()); txt->decode(gbsiff); } // Add decoding of other chunks here iff.close_chunk(); } } void DjVuText::encode(const GP &gbs) { if (txt) { const GP giff=IFFByteStream::create(gbs); IFFByteStream &iff=*giff; iff.put_chunk("TXTz"); { GP gbsiff=BSByteStream::create(iff.get_bytestream(),50); txt->encode(gbsiff); } iff.close_chunk(); } // Add encoding of other chunks here } GP DjVuText::copy(void) const { GP text= new DjVuText; // Copy any primitives (if any) *text=*this; // Copy each substructure if (txt) text->txt = txt->copy(); return text; } static GUTF8String indent ( int spaces) { GUTF8String ret; for( int i = 0 ; i < spaces ; i++ ) ret += ' '; return ret; } static const char *tags[8]= { 0, "HIDDENTEXT", "PAGECOLUMN", "REGION", "PARAGRAPH", "LINE", "WORD", "CHARACTER" }; static const int tags_size=sizeof(tags)/sizeof(const char *); static GUTF8String start_tag(const DjVuTXT::ZoneType zone) { GUTF8String retval; if((tags_size > (int)zone)&&((int)zone > 0)) { switch (zone) { case DjVuTXT::CHARACTER: retval="<"+GUTF8String(tags[zone])+">"; break; case DjVuTXT::WORD: retval=indent(2*(int)zone+2)+"<"+tags[zone]+">"; break; default: retval=indent(2*(int)zone+2)+"<"+tags[zone]+">\n"; break; } } return retval; } static GUTF8String start_tag(const DjVuTXT::ZoneType zone, const GUTF8String &attributes) { GUTF8String retval; if((tags_size > (int)zone)&&((int)zone > 0)) { switch (zone) { case DjVuTXT::CHARACTER: retval="<"+GUTF8String(tags[zone])+" "+attributes+">"; break; case DjVuTXT::WORD: retval=indent(2*(int)zone+2)+"<"+tags[zone]+" "+attributes+">"; break; default: retval=indent(2*(int)zone+2)+"<"+tags[zone]+" "+attributes+">\n"; break; } } return retval; } static inline GUTF8String start_tag(const int layer) { return start_tag((const DjVuTXT::ZoneType)layer); } static GUTF8String end_tag(const DjVuTXT::ZoneType zone) { GUTF8String retval; if((tags_size > (int)zone)&&((int)zone >= 0)) { switch (zone) { case DjVuTXT::CHARACTER: retval=""; break; case DjVuTXT::WORD: retval="\n"; break; default: retval=indent(2*(int)zone+2)+"\n"; break; } } return retval; } static inline GUTF8String end_tag(const int layer) { return end_tag((const DjVuTXT::ZoneType)layer); } static GUTF8String tolayer(int &layer, const DjVuTXT::ZoneType next_layer) { GUTF8String retval; for( ;layer < (int)next_layer;layer++ ) { retval+=start_tag(layer); } while (layer > (int)next_layer ) { retval+=end_tag(--layer); } return retval; } static void writeText( ByteStream & str_out, const GUTF8String &textUTF8, const DjVuTXT::Zone &zone, const int WindowHeight ); static void writeText( ByteStream & str_out, const GUTF8String &textUTF8, const DjVuTXT::ZoneType zlayer, const GList &children, const int WindowHeight ) { // assert( txt->has_valid_zones() ); // DEBUG_MSG( "--zonetype=" << txt->page_zone.ztype << "\n" ); // Beginning tags for missing layers int layer=(int)zlayer; // Output the next layer for(GPosition pos=children ; pos ; ++pos ) { str_out.writestring(tolayer(layer,children[pos].ztype)); writeText( str_out, textUTF8, children[pos], WindowHeight ); } str_out.writestring(tolayer(layer,zlayer)); } static void writeText( ByteStream & str_out, const GUTF8String &textUTF8, const DjVuTXT::Zone &zone, const int WindowHeight ) { // DEBUG_MSG( "--zonetype=" << zone.ztype << "\n" ); const GUTF8String xindent(indent( 2 * zone.ztype + 2 )); GPosition pos=zone.children; // Build attribute string if( ! pos ) { GUTF8String coords; coords.format("coords=\"%d,%d,%d,%d\"", zone.rect.xmin, WindowHeight - 1 - zone.rect.ymin, zone.rect.xmax, WindowHeight - 1 - zone.rect.ymax); const int start=zone.text_start; const int end=textUTF8.firstEndSpace(start,zone.text_length); str_out.writestring(start_tag(zone.ztype,coords)); str_out.writestring(textUTF8.substr(start,end-start).toEscaped()); str_out.writestring(end_tag(zone.ztype)); } else { writeText(str_out,textUTF8,zone.ztype,zone.children,WindowHeight); } } void DjVuTXT::writeText(ByteStream &str_out,const int height) const { if(has_valid_zones()) { ::writeText(str_out,textUTF8,DjVuTXT::PAGE,page_zone.children,height); }else { str_out.writestring(start_tag(DjVuTXT::PAGE)); str_out.writestring(end_tag(DjVuTXT::PAGE)); } } void DjVuText::writeText(ByteStream &str_out,const int height) const { if(txt) { txt->writeText(str_out,height); }else { str_out.writestring("<"+GUTF8String(tags[DjVuTXT::PAGE])+"/>\n"); } } GUTF8String DjVuTXT::get_xmlText(const int height) const { GP gbs(ByteStream::create()); ByteStream &bs=*gbs; writeText(bs,height); bs.seek(0L); return bs.getAsUTF8(); } GUTF8String DjVuText::get_xmlText(const int height) const { GUTF8String retval; if(txt) { retval=txt->get_xmlText(height); }else { retval="<"+GUTF8String(tags[DjVuTXT::PAGE])+"/>\n"; } return retval; } #ifdef HAVE_NAMESPACES } # ifndef NOT_USING_DJVU_NAMESPACE using namespace DJVU; # endif #endif