• Skip to content
  • Skip to link menu
KDE 4.4 API Reference
  • KDE API Reference
  • KDE-PIM Libraries
  • Sitemap
  • Contact Us
 

KMIME Library

kmime_header_parsing.cpp

00001 /*  -*- c++ -*-
00002     kmime_header_parsing.cpp
00003 
00004     KMime, the KDE Internet mail/usenet news message library.
00005     Copyright (c) 2001-2002 Marc Mutz <mutz@kde.org>
00006 
00007     This library is free software; you can redistribute it and/or
00008     modify it under the terms of the GNU Library General Public
00009     License as published by the Free Software Foundation; either
00010     version 2 of the License, or (at your option) any later version.
00011 
00012     This library is distributed in the hope that it will be useful,
00013     but WITHOUT ANY WARRANTY; without even the implied warranty of
00014     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015     Library General Public License for more details.
00016 
00017     You should have received a copy of the GNU Library General Public License
00018     along with this library; see the file COPYING.LIB.  If not, write to
00019     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00020     Boston, MA 02110-1301, USA.
00021 */
00022 
00023 #include "kmime_header_parsing.h"
00024 
00025 #include "kmime_codecs.h"
00026 #include "kmime_headerfactory_p.h"
00027 #include "kmime_headers.h"
00028 #include "kmime_util.h"
00029 #include "kmime_dateformatter.h"
00030 #include "kmime_warning.h"
00031 
00032 #include <kglobal.h>
00033 #include <kcharsets.h>
00034 
00035 #include <QtCore/QTextCodec>
00036 #include <QtCore/QMap>
00037 #include <QtCore/QStringList>
00038 #include <QtCore/QUrl>
00039 
00040 #include <ctype.h> // for isdigit
00041 #include <cassert>
00042 
00043 using namespace KMime;
00044 using namespace KMime::Types;
00045 
00046 namespace KMime {
00047 
00048 namespace Types {
00049 
00050 // QUrl::fromAce is extremely expensive, so only use it when necessary.
00051 // Fortunately, the presence of IDNA is readily detected with a substring match...
00052 static inline QString QUrl_fromAce_wrapper( const QString & domain )
00053 {
00054     if ( domain.contains( QLatin1String( "xn--" ) ) )
00055         return QUrl::fromAce( domain.toLatin1() );
00056     else
00057         return domain;
00058 }
00059 
00060 static QString addr_spec_as_string( const AddrSpec & as, bool pretty )
00061 {
00062   if ( as.isEmpty() ) {
00063     return QString();
00064   }
00065 
00066   bool needsQuotes = false;
00067   QString result;
00068   result.reserve( as.localPart.length() + as.domain.length() + 1 );
00069   for ( int i = 0 ; i < as.localPart.length() ; ++i ) {
00070     const char ch = as.localPart[i].toLatin1();
00071     if ( ch == '.' || isAText( ch ) ) {
00072       result += ch;
00073     } else {
00074       needsQuotes = true;
00075       if ( ch == '\\' || ch == '"' ) {
00076         result += '\\';
00077       }
00078       result += ch;
00079     }
00080   }
00081   const QString dom = pretty ? QUrl_fromAce_wrapper( as.domain ) : as.domain ;
00082   if ( needsQuotes ) {
00083     result = '"' + result + "\"";
00084   }
00085   if( dom.isEmpty() ) {
00086     return result;
00087   } else {
00088     return result + '@' + dom;
00089   }
00090 }
00091 
00092 QString AddrSpec::asString() const
00093 {
00094     return addr_spec_as_string( *this, false );
00095 }
00096 
00097 QString AddrSpec::asPrettyString() const
00098 {
00099     return addr_spec_as_string( *this, true );
00100 }
00101 
00102 bool AddrSpec::isEmpty() const
00103 {
00104   return localPart.isEmpty() && domain.isEmpty();
00105 }
00106 
00107 QByteArray Mailbox::address() const
00108 {
00109   return mAddrSpec.asString().toLatin1();
00110 }
00111 
00112 AddrSpec Mailbox::addrSpec() const
00113 {
00114   return mAddrSpec;
00115 }
00116 
00117 QString Mailbox::name() const
00118 {
00119   return mDisplayName;
00120 }
00121 
00122 void Mailbox::setAddress( const AddrSpec &addr )
00123 {
00124   mAddrSpec = addr;
00125 }
00126 
00127 void Mailbox::setAddress( const QByteArray &addr )
00128 {
00129   const char *cursor = addr.constData();
00130   if ( !HeaderParsing::parseAngleAddr( cursor,
00131                                        cursor + addr.length(), mAddrSpec ) ) {
00132     if ( !HeaderParsing::parseAddrSpec( cursor, cursor + addr.length(),
00133                                         mAddrSpec ) ) {
00134       kWarning() << "Invalid address";
00135       return;
00136     }
00137   }
00138 }
00139 
00140 void Mailbox::setName( const QString &name )
00141 {
00142   mDisplayName = removeBidiControlChars( name );
00143 }
00144 
00145 void Mailbox::setNameFrom7Bit( const QByteArray &name,
00146                                const QByteArray &defaultCharset )
00147 {
00148   QByteArray cs;
00149   setName( decodeRFC2047String( name, cs, defaultCharset, false ) );
00150 }
00151 
00152 bool Mailbox::hasAddress() const
00153 {
00154   return !mAddrSpec.isEmpty();
00155 }
00156 
00157 bool Mailbox::hasName() const
00158 {
00159   return !mDisplayName.isEmpty();
00160 }
00161 
00162 QString Mailbox::prettyAddress() const
00163 {
00164   if ( !hasName() ) {
00165     return address();
00166   }
00167   QString s = name();
00168   if ( hasAddress() ) {
00169     s += QLatin1String(" <") + address() + QLatin1Char('>');
00170   }
00171   return s;
00172 }
00173 
00174 void Mailbox::fromUnicodeString( const QString &s )
00175 {
00176   from7BitString( encodeRFC2047String( s, "utf-8", false ) );
00177 }
00178 
00179 void Mailbox::from7BitString( const QByteArray &s )
00180 {
00181   const char *cursor = s.constData();
00182   HeaderParsing::parseMailbox( cursor, cursor + s.length(), *this );
00183 }
00184 
00185 QByteArray KMime::Types::Mailbox::as7BitString( const QByteArray &encCharset ) const
00186 {
00187   if ( !hasName() ) {
00188     return address();
00189   }
00190   QByteArray rv;
00191   if ( isUsAscii( name() ) ) {
00192     QByteArray tmp = name().toLatin1();
00193     addQuotes( tmp, false );
00194     rv += tmp;
00195   } else {
00196     rv += encodeRFC2047String( name(), encCharset, true );
00197   }
00198   if ( hasAddress() ) {
00199     rv += " <" + address() + '>';
00200   }
00201   return rv;
00202 }
00203 
00204 } // namespace Types
00205 
00206 namespace HeaderParsing {
00207 
00208 // parse the encoded-word (scursor points to after the initial '=')
00209 bool parseEncodedWord( const char* &scursor, const char * const send,
00210                        QString &result, QByteArray &language,
00211                        QByteArray &usedCS, const QByteArray &defaultCS,
00212                        bool forceCS )
00213 {
00214   // make sure the caller already did a bit of the work.
00215   assert( *(scursor-1) == '=' );
00216 
00217   //
00218   // STEP 1:
00219   // scan for the charset/language portion of the encoded-word
00220   //
00221 
00222   char ch = *scursor++;
00223 
00224   if ( ch != '?' ) {
00225     // kDebug() << "first";
00226     //KMIME_WARN_PREMATURE_END_OF( EncodedWord );
00227     return false;
00228   }
00229 
00230   // remember start of charset (ie. just after the initial "=?") and
00231   // language (just after the first '*') fields:
00232   const char * charsetStart = scursor;
00233   const char * languageStart = 0;
00234 
00235   // find delimiting '?' (and the '*' separating charset and language
00236   // tags, if any):
00237   for ( ; scursor != send ; scursor++ ) {
00238     if ( *scursor == '?') {
00239       break;
00240     } else if ( *scursor == '*' && languageStart == 0 ) {
00241       languageStart = scursor + 1;
00242     }
00243   }
00244 
00245   // not found? can't be an encoded-word!
00246   if ( scursor == send || *scursor != '?' ) {
00247     // kDebug() << "second";
00248     KMIME_WARN_PREMATURE_END_OF( EncodedWord );
00249     return false;
00250   }
00251 
00252   // extract the language information, if any (if languageStart is 0,
00253   // language will be null, too):
00254   QByteArray maybeLanguage( languageStart, scursor - languageStart );
00255   // extract charset information (keep in mind: the size given to the
00256   // ctor is one off due to the \0 terminator):
00257   QByteArray maybeCharset( charsetStart,
00258                            ( languageStart ? languageStart - 1 : scursor ) - charsetStart );
00259 
00260   //
00261   // STEP 2:
00262   // scan for the encoding portion of the encoded-word
00263   //
00264 
00265   // remember start of encoding (just _after_ the second '?'):
00266   scursor++;
00267   const char * encodingStart = scursor;
00268 
00269   // find next '?' (ending the encoding tag):
00270   for ( ; scursor != send ; scursor++ ) {
00271     if ( *scursor == '?' ) {
00272       break;
00273     }
00274   }
00275 
00276   // not found? Can't be an encoded-word!
00277   if ( scursor == send || *scursor != '?' ) {
00278     // kDebug() << "third";
00279     KMIME_WARN_PREMATURE_END_OF( EncodedWord );
00280     return false;
00281   }
00282 
00283   // extract the encoding information:
00284   QByteArray maybeEncoding( encodingStart, scursor - encodingStart );
00285 
00286   // kDebug() << "parseEncodedWord: found charset == \"" << maybeCharset
00287   //         << "\"; language == \"" << maybeLanguage
00288   //         << "\"; encoding == \"" << maybeEncoding << "\"";
00289 
00290   //
00291   // STEP 3:
00292   // scan for encoded-text portion of encoded-word
00293   //
00294 
00295   // remember start of encoded-text (just after the third '?'):
00296   scursor++;
00297   const char * encodedTextStart = scursor;
00298 
00299   // find the '?=' sequence (ending the encoded-text):
00300   for ( ; scursor != send ; scursor++ ) {
00301     if ( *scursor == '?' ) {
00302       if ( scursor + 1 != send ) {
00303         if ( *( scursor + 1 ) != '=' ) { // We expect a '=' after the '?', but we got something else; ignore
00304           KMIME_WARN << "Stray '?' in q-encoded word, ignoring this.";
00305           continue;
00306         }
00307         else { // yep, found a '?=' sequence
00308           scursor += 2;
00309           break;
00310         }
00311       }
00312       else { // The '?' is the last char, but we need a '=' after it!
00313         KMIME_WARN_PREMATURE_END_OF( EncodedWord );
00314         return false;
00315       }
00316     }
00317   }
00318 
00319   if ( *( scursor - 2 ) != '?' || *( scursor - 1 ) != '=' ||
00320        scursor < encodedTextStart + 2 ) {
00321     KMIME_WARN_PREMATURE_END_OF( EncodedWord );
00322     return false;
00323   }
00324 
00325   // set end sentinel for encoded-text:
00326   const char * const encodedTextEnd = scursor - 2;
00327 
00328   //
00329   // STEP 4:
00330   // setup decoders for the transfer encoding and the charset
00331   //
00332 
00333   // try if there's a codec for the encoding found:
00334   Codec * codec = Codec::codecForName( maybeEncoding );
00335   if ( !codec ) {
00336     KMIME_WARN_UNKNOWN( Encoding, maybeEncoding );
00337     return false;
00338   }
00339 
00340   // get an instance of a corresponding decoder:
00341   Decoder * dec = codec->makeDecoder();
00342   assert( dec );
00343 
00344   // try if there's a (text)codec for the charset found:
00345   bool matchOK = false;
00346   QTextCodec *textCodec = 0;
00347   if ( forceCS || maybeCharset.isEmpty() ) {
00348     textCodec = KGlobal::charsets()->codecForName( defaultCS, matchOK );
00349     usedCS = cachedCharset( defaultCS );
00350   } else {
00351     textCodec = KGlobal::charsets()->codecForName( maybeCharset, matchOK );
00352     if ( !matchOK ) {  //no suitable codec found => use default charset
00353       textCodec = KGlobal::charsets()->codecForName( defaultCS, matchOK );
00354       usedCS = cachedCharset( defaultCS );
00355     } else {
00356       usedCS = cachedCharset( maybeCharset );
00357     }
00358   }
00359 
00360   if ( !matchOK || !textCodec ) {
00361     KMIME_WARN_UNKNOWN( Charset, maybeCharset );
00362     delete dec;
00363     return false;
00364   };
00365 
00366   // kDebug() << "mimeName(): \"" << textCodec->name() << "\"";
00367 
00368   // allocate a temporary buffer to store the 8bit text:
00369   int encodedTextLength = encodedTextEnd - encodedTextStart;
00370   QByteArray buffer;
00371   buffer.resize( codec->maxDecodedSizeFor( encodedTextLength ) );
00372   char *bbegin = buffer.data();
00373   char *bend = bbegin + buffer.length();
00374 
00375   //
00376   // STEP 5:
00377   // do the actual decoding
00378   //
00379 
00380   if ( !dec->decode( encodedTextStart, encodedTextEnd, bbegin, bend ) ) {
00381     KMIME_WARN << codec->name() << "codec lies about its maxDecodedSizeFor("
00382                << encodedTextLength << ")\nresult may be truncated";
00383   }
00384 
00385   result = textCodec->toUnicode( buffer.data(), bbegin - buffer.data() );
00386 
00387   // kDebug() << "result now: \"" << result << "\"";
00388   // cleanup:
00389   delete dec;
00390   language = maybeLanguage;
00391 
00392   return true;
00393 }
00394 
// Advances scursor past any run of ' ', '\n', '\t' and '\r', stopping at the
// first other character or at send.
static inline void eatWhiteSpace( const char* &scursor, const char * const send )
{
  for ( ; scursor != send ; ++scursor ) {
    switch ( *scursor ) {
    case ' ':
    case '\n':
    case '\t':
    case '\r':
      continue; // still whitespace: keep going
    default:
      return;
    }
  }
}
00402 
00403 bool parseAtom( const char * &scursor, const char * const send,
00404                 QString &result, bool allow8Bit )
00405 {
00406   QPair<const char*,int> maybeResult;
00407 
00408   if ( parseAtom( scursor, send, maybeResult, allow8Bit ) ) {
00409     result += QString::fromLatin1( maybeResult.first, maybeResult.second );
00410     return true;
00411   }
00412 
00413   return false;
00414 }
00415 
00416 bool parseAtom( const char * &scursor, const char * const send,
00417                 QPair<const char*,int> &result, bool allow8Bit )
00418 {
00419   bool success = false;
00420   const char *start = scursor;
00421 
00422   while ( scursor != send ) {
00423     signed char ch = *scursor++;
00424     if ( ch > 0 && isAText( ch ) ) {
00425       // AText: OK
00426       success = true;
00427     } else if ( allow8Bit && ch < 0 ) {
00428       // 8bit char: not OK, but be tolerant.
00429       KMIME_WARN_8BIT( ch );
00430       success = true;
00431     } else {
00432       // CTL or special - marking the end of the atom:
00433       // re-set sursor to point to the offending
00434       // char and return:
00435       scursor--;
00436       break;
00437     }
00438   }
00439   result.first = start;
00440   result.second = scursor - start;
00441   return success;
00442 }
00443 
00444 bool parseToken( const char * &scursor, const char * const send,
00445                  QString &result, bool allow8Bit )
00446 {
00447   QPair<const char*,int> maybeResult;
00448 
00449   if ( parseToken( scursor, send, maybeResult, allow8Bit ) ) {
00450     result += QString::fromLatin1( maybeResult.first, maybeResult.second );
00451     return true;
00452   }
00453 
00454   return false;
00455 }
00456 
00457 bool parseToken( const char * &scursor, const char * const send,
00458                  QPair<const char*,int> &result, bool allow8Bit )
00459 {
00460   bool success = false;
00461   const char * start = scursor;
00462 
00463   while ( scursor != send ) {
00464     signed char ch = *scursor++;
00465     if ( ch > 0 && isTText( ch ) ) {
00466       // TText: OK
00467       success = true;
00468     } else if ( allow8Bit && ch < 0 ) {
00469       // 8bit char: not OK, but be tolerant.
00470       KMIME_WARN_8BIT( ch );
00471       success = true;
00472     } else {
00473       // CTL or tspecial - marking the end of the atom:
00474       // re-set sursor to point to the offending
00475       // char and return:
00476       scursor--;
00477       break;
00478     }
00479   }
00480   result.first = start;
00481   result.second = scursor - start;
00482   return success;
00483 }
00484 
// Reads the next input character into the local `ch` and advances the local
// `scursor`; if `scursor` has already reached `send`, emits a premature-end
// warning and makes the enclosing function return false.
// Wrapped in do { } while ( 0 ) so that "READ_ch_OR_FAIL;" is a single
// statement and stays correct as the body of an unbraced if/else.
#define READ_ch_OR_FAIL                                   \
  do {                                                    \
    if ( scursor == send ) {                              \
      KMIME_WARN_PREMATURE_END_OF( GenericQuotedString ); \
      return false;                                       \
    }                                                     \
    ch = *scursor++;                                      \
  } while ( 0 )
00491 
00492 // known issues:
00493 //
00494 // - doesn't handle quoted CRLF
00495 
// Parses the inside of a delimited construct (quoted-string, comment or
// domain-literal, depending on openChar/closeChar), appending the unescaped
// and unfolded text to result.
//
// scursor   in/out read position; must point just behind an openChar or
//           closeChar (asserted).
// send      end sentinel of the input.
// result    text is appended to it; the terminating delimiter is not.
// isCRLF    if true, a lone LF is never treated as folding.
//
// Returns true as soon as an unescaped openChar or closeChar is consumed
// (scursor then points just behind it, so the caller can inspect
// *(scursor-1)). Returns false when the input ends first; text collected up
// to that point has already been appended to result.
00496 bool parseGenericQuotedString( const char* &scursor, const char * const send,
00497                                QString &result, bool isCRLF,
00498                                const char openChar, const char closeChar )
00499 {
00500   char ch;
00501   // We are in a quoted-string or domain-literal or comment and the
00502   // cursor points to the first char after the openChar.
00503   // We will apply unfolding and quoted-pair removal.
00504   // We return when we either encounter the end or unescaped openChar
00505   // or closeChar.
00506 
00507   assert( *(scursor-1) == openChar || *(scursor-1) == closeChar );
00508 
00509   while ( scursor != send ) {
00510     ch = *scursor++;
00511 
00512     if ( ch == closeChar || ch == openChar ) {
00513       // end of quoted-string or another opening char:
00514       // let caller decide what to do.
00515       return true;
00516     }
00517 
00518     switch( ch ) {
00519     case '\\':      // quoted-pair
00520       // misses "\" CRLF LWSP-char handling, see rfc822, 3.4.5
00521       READ_ch_OR_FAIL;
00522       KMIME_WARN_IF_8BIT( ch );
00523       result += QChar( ch );
00524       break;
00525     case '\r':
00526       // ###
00527       // The case of a lonely '\r' is easy to solve, as they're
00528       // not part of Unix line-ending conventions.
00529       // But I see a problem if we are given Unix-native
00530       // line-ending-mails, where we cannot determine anymore
00531       // whether a given '\n' was part of a CRLF or was occurring
00532       // on its own.
00533       READ_ch_OR_FAIL;
00534       if ( ch != '\n' ) {
00535         // CR on its own...
00536         KMIME_WARN_LONE( CR );
00537         result += QChar('\r');
00538         scursor--; // points to after the '\r' again
00539       } else {
00540         // CRLF encountered.
00541         // lookahead: check for folding
00542         READ_ch_OR_FAIL;
00543         if ( ch == ' ' || ch == '\t' ) {
00544           // correct folding;
00545           // position cursor behind the CRLF WSP (unfolding)
00546           // and add the WSP to the result
00547           result += QChar( ch );
00548         } else {
00549           // this is the "shouldn't happen"-case. There is a CRLF
00550           // inside a quoted-string without it being part of FWS.
00551           // We take it verbatim.
00552           KMIME_WARN_NON_FOLDING( CRLF );
00553           result += "\r\n";
00554           // the cursor is decremented again, so we need not
00555           // duplicate the whole switch here. "ch" could've been
00556           // anything (incl. openChar or closeChar).
00557           scursor--;
00558         }
00559       }
00560       break;
00561     case '\n':
00562       // Note: CRLF has been handled above already!
00563       // ### LF needs special treatment, depending on whether isCRLF
00564       // is true (we can be sure a lonely '\n' was meant this way) or
00565       // false ('\n' alone could have meant LF or CRLF in the original
00566       // message. This parser assumes CRLF iff the LF is followed by
00567       // either WSP (folding) or NULL (premature end of quoted-string;
00568       // Should be fixed, since NULL is allowed as per rfc822).
00569       READ_ch_OR_FAIL;
00570       if ( !isCRLF && ( ch == ' ' || ch == '\t' ) ) {
00571         // folding
00572         // correct folding
00573         result += QChar( ch );
00574       } else {
00575         // non-folding
00576         KMIME_WARN_LONE( LF );
00577         result += QChar('\n');
00578         // pos is decremented, so we need not duplicate the whole
00579         // switch here. ch could've been anything (incl. <">, "\").
00580         scursor--;
00581       }
00582       break;
00583     default:
00584       KMIME_WARN_IF_8BIT( ch );
00585       result += QChar( ch );
00586     }
00587   }
00588 
00589   return false;
00590 }
00591 
00592 // known issues:
00593 //
00594 // - doesn't handle encoded-word inside comments.
00595 
00596 bool parseComment( const char* &scursor, const char * const send,
00597                    QString &result, bool isCRLF, bool reallySave )
00598 {
00599   int commentNestingDepth = 1;
00600   const char *afterLastClosingParenPos = 0;
00601   QString maybeCmnt;
00602   const char *oldscursor = scursor;
00603 
00604   assert( *(scursor-1) == '(' );
00605 
00606   while ( commentNestingDepth ) {
00607     QString cmntPart;
00608     if ( parseGenericQuotedString( scursor, send, cmntPart, isCRLF, '(', ')' ) ) {
00609       assert( *(scursor-1) == ')' || *(scursor-1) == '(' );
00610       // see the kdoc for above function for the possible conditions
00611       // we have to check:
00612       switch ( *(scursor-1) ) {
00613       case ')':
00614         if ( reallySave ) {
00615           // add the chunk that's now surely inside the comment.
00616           result += maybeCmnt;
00617           result += cmntPart;
00618           if ( commentNestingDepth > 1 ) {
00619             // don't add the outermost ')'...
00620             result += QChar(')');
00621           }
00622           maybeCmnt.clear();
00623         }
00624         afterLastClosingParenPos = scursor;
00625         --commentNestingDepth;
00626         break;
00627       case '(':
00628         if ( reallySave ) {
00629           // don't add to "result" yet, because we might find that we
00630           // are already outside the (broken) comment...
00631           maybeCmnt += cmntPart;
00632           maybeCmnt += QChar('(');
00633         }
00634         ++commentNestingDepth;
00635         break;
00636       default: assert( 0 );
00637       } // switch
00638     } else {
00639       // !parseGenericQuotedString, ie. premature end
00640       if ( afterLastClosingParenPos ) {
00641         scursor = afterLastClosingParenPos;
00642       } else {
00643         scursor = oldscursor;
00644       }
00645       return false;
00646     }
00647   } // while
00648 
00649   return true;
00650 }
00651 
00652 // known issues: none.
00653 
// Parses a phrase: a sequence of atoms, quoted-strings and encoded-words,
// with comments skipped and stray '.' characters tolerated after the first
// word.
//
// The decoded words are appended to result, separated by a single space;
// no space is inserted between two directly adjacent encoded-words.
// successfullyParsed tracks the position just behind the last element that
// was accepted; when a later element fails to parse, scursor is rewound to
// it in the comment and atom branches.
//
// Returns true if at least one word was found. When nothing has been found
// yet and an element fails, false is returned; only the '.' branch rewinds
// scursor in that case.
00654 bool parsePhrase( const char* &scursor, const char * const send,
00655                   QString &result, bool isCRLF )
00656 {
00657   enum {
00658     None, Phrase, Atom, EncodedWord, QuotedString
00659   } found = None;
00660 
00661   QString tmp;
00662   QByteArray lang, charset;
00663   const char *successfullyParsed = 0;
00664   // only used by the encoded-word branch
00665   const char *oldscursor;
00666   // used to suppress whitespace between adjacent encoded-words
00667   // (rfc2047, 6.2):
00668   bool lastWasEncodedWord = false;
00669 
00670   while ( scursor != send ) {
00671     char ch = *scursor++;
00672     switch ( ch ) {
00673     case '.': // broken, but allow for interop's sake
00674       if ( found == None ) {
00675         --scursor;
00676         return false;
00677       } else {
00678         if ( scursor != send && ( *scursor == ' ' || *scursor == '\t' ) ) {
00679           result += ". ";
00680         } else {
00681           result += '.';
00682         }
00683         successfullyParsed = scursor;
00684       }
00685       break;
00686     case '"': // quoted-string
00687       tmp.clear();
00688       if ( parseGenericQuotedString( scursor, send, tmp, isCRLF, '"', '"' ) ) {
00689         successfullyParsed = scursor;
00690         assert( *(scursor-1) == '"' );
00691         switch ( found ) {
00692         case None:
00693           found = QuotedString;
00694           break;
00695         case Phrase:
00696         case Atom:
00697         case EncodedWord:
00698         case QuotedString:
00699           found = Phrase;
00700           result += QChar(' '); // rfc822, 3.4.4
00701           break;
00702         default:
00703           assert( 0 );
00704         }
00705         lastWasEncodedWord = false;
00706         result += tmp;
00707       } else {
00708         // premature end of quoted string.
00709         // What to do? Return leading '"' as special? Return as quoted-string?
00710         // We do the latter if we already found something, else signal failure.
00711         if ( found == None ) {
00712           return false;
00713         } else {
00714           result += QChar(' '); // rfc822, 3.4.4
00715           result += tmp;
00716           return true;
00717         }
00718       }
00719       break;
00720     case '(': // comment
00721       // parse it, but ignore content:
00722       tmp.clear();
00723       if ( parseComment( scursor, send, tmp, isCRLF,
00724                          false /*don't bother with the content*/ ) ) {
00725         successfullyParsed = scursor;
00726         lastWasEncodedWord = false; // strictly interpreting rfc2047, 6.2
00727       } else {
00728         if ( found == None ) {
00729           return false;
00730         } else {
00731           scursor = successfullyParsed;
00732           return true;
00733         }
00734       }
00735       break;
00736     case '=': // encoded-word
00737       tmp.clear();
00738       oldscursor = scursor;
00739       lang.clear();
00740       charset.clear();
00741       if ( parseEncodedWord( scursor, send, tmp, lang, charset ) ) {
00742         successfullyParsed = scursor;
00743         switch ( found ) {
00744         case None:
00745           found = EncodedWord;
00746           break;
00747         case Phrase:
00748         case EncodedWord:
00749         case Atom:
00750         case QuotedString:
00751           if ( !lastWasEncodedWord ) {
00752             result += QChar(' '); // rfc822, 3.4.4
00753           }
00754           found = Phrase;
00755           break;
00756         default: assert( 0 );
00757         }
00758         lastWasEncodedWord = true;
00759         result += tmp;
00760         break;
00761       } else {
00762         // not an encoded-word after all: rewind and parse as atom
00763         scursor = oldscursor;
00764       }
00765       // fall through (intentional): the '=' is handled as part of an atom
00766 
00767     default: //atom
00768       tmp.clear();
00769       scursor--;
00770       if ( parseAtom( scursor, send, tmp, true /* allow 8bit */ ) ) {
00771         successfullyParsed = scursor;
00772         switch ( found ) {
00773         case None:
00774           found = Atom;
00775           break;
00776         case Phrase:
00777         case Atom:
00778         case EncodedWord:
00779         case QuotedString:
00780           found = Phrase;
00781           result += QChar(' '); // rfc822, 3.4.4
00782           break;
00783         default:
00784           assert( 0 );
00785         }
00786         lastWasEncodedWord = false;
00787         result += tmp;
00788       } else {
00789         if ( found == None ) {
00790           return false;
00791         } else {
00792           scursor = successfullyParsed;
00793           return true;
00794         }
00795       }
00796     }
00797     eatWhiteSpace( scursor, send );
00798   }
00799 
00800   return found != None;
00801 }
00802 
00803 bool parseDotAtom( const char* &scursor, const char * const send,
00804                    QString &result, bool isCRLF )
00805 {
00806   eatCFWS( scursor, send, isCRLF );
00807 
00808   // always points to just after the last atom parsed:
00809   const char *successfullyParsed;
00810 
00811   QString tmp;
00812   if ( !parseAtom( scursor, send, tmp, false /* no 8bit */ ) ) {
00813     return false;
00814   }
00815   result += tmp;
00816   successfullyParsed = scursor;
00817 
00818   while ( scursor != send ) {
00819 
00820     // end of header or no '.' -> return
00821     if ( scursor == send || *scursor != '.' ) {
00822       return true;
00823     }
00824     scursor++; // eat '.'
00825 
00826     if ( scursor == send || !isAText( *scursor ) ) {
00827       // end of header or no AText, but this time following a '.'!:
00828       // reset cursor to just after last successfully parsed char and
00829       // return:
00830       scursor = successfullyParsed;
00831       return true;
00832     }
00833 
00834     // try to parse the next atom:
00835     QString maybeAtom;
00836     if ( !parseAtom( scursor, send, maybeAtom, false /*no 8bit*/ ) ) {
00837       scursor = successfullyParsed;
00838       return true;
00839     }
00840 
00841     result += QChar('.');
00842     result += maybeAtom;
00843     successfullyParsed = scursor;
00844   }
00845 
00846   scursor = successfullyParsed;
00847   return true;
00848 }
00849 
00850 void eatCFWS( const char* &scursor, const char * const send, bool isCRLF )
00851 {
00852   QString dummy;
00853 
00854   while ( scursor != send ) {
00855     const char *oldscursor = scursor;
00856 
00857     char ch = *scursor++;
00858 
00859     switch( ch ) {
00860     case ' ':
00861     case '\t': // whitespace
00862     case '\r':
00863     case '\n': // folding
00864       continue;
00865 
00866     case '(': // comment
00867       if ( parseComment( scursor, send, dummy, isCRLF, false /*don't save*/ ) ) {
00868         continue;
00869       }
00870       scursor = oldscursor;
00871       return;
00872 
00873     default:
00874       scursor = oldscursor;
00875       return;
00876     }
00877   }
00878 }
00879 
00880 bool parseDomain( const char* &scursor, const char * const send,
00881                   QString &result, bool isCRLF )
00882 {
00883   eatCFWS( scursor, send, isCRLF );
00884   if ( scursor == send ) {
00885     return false;
00886   }
00887 
00888   // domain := dot-atom / domain-literal / atom *("." atom)
00889   //
00890   // equivalent to:
00891   // domain = dot-atom / domain-literal,
00892   // since parseDotAtom does allow CFWS between atoms and dots
00893 
00894   if ( *scursor == '[' ) {
00895     // domain-literal:
00896     QString maybeDomainLiteral;
00897     // eat '[':
00898     scursor++;
00899     while ( parseGenericQuotedString( scursor, send, maybeDomainLiteral,
00900                                       isCRLF, '[', ']' ) ) {
00901       if ( scursor == send ) {
00902         // end of header: check for closing ']':
00903         if ( *(scursor-1) == ']' ) {
00904           // OK, last char was ']':
00905           result = maybeDomainLiteral;
00906           return true;
00907         } else {
00908           // not OK, domain-literal wasn't closed:
00909           return false;
00910         }
00911       }
00912       // we hit openChar in parseGenericQuotedString.
00913       // include it in maybeDomainLiteral and keep on parsing:
00914       if ( *(scursor-1) == '[' ) {
00915         maybeDomainLiteral += QChar('[');
00916         continue;
00917       }
00918       // OK, real end of domain-literal:
00919       result = maybeDomainLiteral;
00920       return true;
00921     }
00922   } else {
00923     // dot-atom:
00924     QString maybeDotAtom;
00925     if ( parseDotAtom( scursor, send, maybeDotAtom, isCRLF ) ) {
00926       result = maybeDotAtom;
00927       // Domain may end with '.', if so preserve it'
00928       if ( scursor != send && *scursor == '.' ) {
00929         result += QChar('.');
00930         scursor++;
00931       }
00932       return true;
00933     }
00934   }
00935   return false;
00936 }
00937 
00938 bool parseObsRoute( const char* &scursor, const char* const send,
00939                     QStringList &result, bool isCRLF, bool save )
00940 {
00941   while ( scursor != send ) {
00942     eatCFWS( scursor, send, isCRLF );
00943     if ( scursor == send ) {
00944       return false;
00945     }
00946 
00947     // empty entry:
00948     if ( *scursor == ',' ) {
00949       scursor++;
00950       if ( save ) {
00951         result.append( QString() );
00952       }
00953       continue;
00954     }
00955 
00956     // empty entry ending the list:
00957     if ( *scursor == ':' ) {
00958       scursor++;
00959       if ( save ) {
00960         result.append( QString() );
00961       }
00962       return true;
00963     }
00964 
00965     // each non-empty entry must begin with '@':
00966     if ( *scursor != '@' ) {
00967       return false;
00968     } else {
00969       scursor++;
00970     }
00971 
00972     QString maybeDomain;
00973     if ( !parseDomain( scursor, send, maybeDomain, isCRLF ) ) {
00974       return false;
00975     }
00976     if ( save ) {
00977       result.append( maybeDomain );
00978     }
00979 
00980     // eat the following (optional) comma:
00981     eatCFWS( scursor, send, isCRLF );
00982     if ( scursor == send ) {
00983       return false;
00984     }
00985     if ( *scursor == ':' ) {
00986       scursor++;
00987       return true;
00988     }
00989     if ( *scursor == ',' ) {
00990       scursor++;
00991     }
00992   }
00993 
00994   return false;
00995 }
00996 
00997 bool parseAddrSpec( const char* &scursor, const char * const send,
00998                     AddrSpec &result, bool isCRLF )
00999 {
01000   //
01001   // STEP 1:
01002   // local-part := dot-atom / quoted-string / word *("." word)
01003   //
01004   // this is equivalent to:
01005   // local-part := word *("." word)
01006 
01007   QString maybeLocalPart;
01008   QString tmp;
01009 
01010   while ( scursor != send ) {
01011     // first, eat any whitespace
01012     eatCFWS( scursor, send, isCRLF );
01013 
01014     char ch = *scursor++;
01015     switch ( ch ) {
01016     case '.': // dot
01017       maybeLocalPart += QChar('.');
01018       break;
01019 
01020     case '@':
01021       goto SAW_AT_SIGN;
01022       break;
01023 
01024     case '"': // quoted-string
01025       tmp.clear();
01026       if ( parseGenericQuotedString( scursor, send, tmp, isCRLF, '"', '"' ) ) {
01027         maybeLocalPart += tmp;
01028       } else {
01029         return false;
01030       }
01031       break;
01032 
01033     default: // atom
01034       scursor--; // re-set scursor to point to ch again
01035       tmp.clear();
01036       if ( parseAtom( scursor, send, tmp, false /* no 8bit */ ) ) {
01037         maybeLocalPart += tmp;
01038       } else {
01039         return false; // parseAtom can only fail if the first char is non-atext.
01040       }
01041       break;
01042     }
01043   }
01044 
01045   return false;
01046 
01047   //
01048   // STEP 2:
01049   // domain
01050   //
01051 
01052 SAW_AT_SIGN:
01053 
01054   assert( *(scursor-1) == '@' );
01055 
01056   QString maybeDomain;
01057   if ( !parseDomain( scursor, send, maybeDomain, isCRLF ) ) {
01058     return false;
01059   }
01060 
01061   result.localPart = maybeLocalPart;
01062   result.domain = maybeDomain;
01063 
01064   return true;
01065 }
01066 
01067 bool parseAngleAddr( const char* &scursor, const char * const send,
01068                      AddrSpec &result, bool isCRLF )
01069 {
01070   // first, we need an opening angle bracket:
01071   eatCFWS( scursor, send, isCRLF );
01072   if ( scursor == send || *scursor != '<' ) {
01073     return false;
01074   }
01075   scursor++; // eat '<'
01076 
01077   eatCFWS( scursor, send, isCRLF );
01078   if ( scursor == send ) {
01079     return false;
01080   }
01081 
01082   if ( *scursor == '@' || *scursor == ',' ) {
01083     // obs-route: parse, but ignore:
01084     KMIME_WARN << "obsolete source route found! ignoring.";
01085     QStringList dummy;
01086     if ( !parseObsRoute( scursor, send, dummy,
01087                          isCRLF, false /* don't save */ ) ) {
01088       return false;
01089     }
01090     // angle-addr isn't complete until after the '>':
01091     if ( scursor == send ) {
01092       return false;
01093     }
01094   }
01095 
01096   // parse addr-spec:
01097   AddrSpec maybeAddrSpec;
01098   if ( !parseAddrSpec( scursor, send, maybeAddrSpec, isCRLF ) ) {
01099     return false;
01100   }
01101 
01102   eatCFWS( scursor, send, isCRLF );
01103   if ( scursor == send || *scursor != '>' ) {
01104     return false;
01105   }
01106   scursor++;
01107 
01108   result = maybeAddrSpec;
01109   return true;
01110 
01111 }
01112 
01113 bool parseMailbox( const char* &scursor, const char * const send,
01114                    Mailbox &result, bool isCRLF )
01115 {
01116   eatCFWS( scursor, send, isCRLF );
01117   if ( scursor == send ) {
01118     return false;
01119   }
01120 
01121   AddrSpec maybeAddrSpec;
01122   QString maybeDisplayName;
01123 
01124   // first, try if it's a vanilla addr-spec:
01125   const char * oldscursor = scursor;
01126   if ( parseAddrSpec( scursor, send, maybeAddrSpec, isCRLF ) ) {
01127     result.setAddress( maybeAddrSpec );
01128     // check for the obsolete form of display-name (as comment):
01129     eatWhiteSpace( scursor, send );
01130     if ( scursor != send && *scursor == '(' ) {
01131       scursor++;
01132       if ( !parseComment( scursor, send, maybeDisplayName, isCRLF, true /*keep*/ ) ) {
01133         return false;
01134       }
01135     }
01136     result.setNameFrom7Bit( maybeDisplayName.toLatin1() );
01137     return true;
01138   }
01139   scursor = oldscursor;
01140 
01141   // second, see if there's a display-name:
01142   if ( !parsePhrase( scursor, send, maybeDisplayName, isCRLF ) ) {
01143     // failed: reset cursor, note absent display-name
01144     maybeDisplayName.clear();
01145     scursor = oldscursor;
01146   } else {
01147     // succeeded: eat CFWS
01148     eatCFWS( scursor, send, isCRLF );
01149     if ( scursor == send ) {
01150       return false;
01151     }
01152   }
01153 
01154   // third, parse the angle-addr:
01155   if ( !parseAngleAddr( scursor, send, maybeAddrSpec, isCRLF ) ) {
01156     return false;
01157   }
01158 
01159   if ( maybeDisplayName.isNull() ) {
01160     // check for the obsolete form of display-name (as comment):
01161     eatWhiteSpace( scursor, send );
01162     if ( scursor != send && *scursor == '(' ) {
01163       scursor++;
01164       if ( !parseComment( scursor, send, maybeDisplayName, isCRLF, true /*keep*/ ) ) {
01165         return false;
01166       }
01167     }
01168   }
01169 
01170   result.setName( maybeDisplayName );
01171   result.setAddress( maybeAddrSpec );
01172   return true;
01173 }
01174 
01175 bool parseGroup( const char* &scursor, const char * const send,
01176                  Address &result, bool isCRLF )
01177 {
01178   // group         := display-name ":" [ mailbox-list / CFWS ] ";" [CFWS]
01179   //
01180   // equivalent to:
01181   // group   := display-name ":" [ obs-mbox-list ] ";"
01182 
01183   eatCFWS( scursor, send, isCRLF );
01184   if ( scursor == send ) {
01185     return false;
01186   }
01187 
01188   // get display-name:
01189   QString maybeDisplayName;
01190   if ( !parsePhrase( scursor, send, maybeDisplayName, isCRLF ) ) {
01191     return false;
01192   }
01193 
01194   // get ":":
01195   eatCFWS( scursor, send, isCRLF );
01196   if ( scursor == send || *scursor != ':' ) {
01197     return false;
01198   }
01199 
01200   // KDE5 TODO: Don't expose displayName as public, but rather add setter for it that
01201   //            automatically calls removeBidiControlChars
01202   result.displayName = removeBidiControlChars( maybeDisplayName );
01203 
01204   // get obs-mbox-list (may contain empty entries):
01205   scursor++;
01206   while ( scursor != send ) {
01207     eatCFWS( scursor, send, isCRLF );
01208     if ( scursor == send ) {
01209       return false;
01210     }
01211 
01212     // empty entry:
01213     if ( *scursor == ',' ) {
01214       scursor++;
01215       continue;
01216     }
01217 
01218     // empty entry ending the list:
01219     if ( *scursor == ';' ) {
01220       scursor++;
01221       return true;
01222     }
01223 
01224     Mailbox maybeMailbox;
01225     if ( !parseMailbox( scursor, send, maybeMailbox, isCRLF ) ) {
01226       return false;
01227     }
01228     result.mailboxList.append( maybeMailbox );
01229 
01230     eatCFWS( scursor, send, isCRLF );
01231     // premature end:
01232     if ( scursor == send ) {
01233       return false;
01234     }
01235     // regular end of the list:
01236     if ( *scursor == ';' ) {
01237       scursor++;
01238       return true;
01239     }
01240     // eat regular list entry separator:
01241     if ( *scursor == ',' ) {
01242       scursor++;
01243     }
01244   }
01245   return false;
01246 }
01247 
01248 bool parseAddress( const char* &scursor, const char * const send,
01249                    Address &result, bool isCRLF )
01250 {
01251   // address       := mailbox / group
01252 
01253   eatCFWS( scursor, send, isCRLF );
01254   if ( scursor == send ) {
01255     return false;
01256   }
01257 
01258   // first try if it's a single mailbox:
01259   Mailbox maybeMailbox;
01260   const char * oldscursor = scursor;
01261   if ( parseMailbox( scursor, send, maybeMailbox, isCRLF ) ) {
01262     // yes, it is:
01263     result.displayName.clear();
01264     result.mailboxList.append( maybeMailbox );
01265     return true;
01266   }
01267   scursor = oldscursor;
01268 
01269   Address maybeAddress;
01270 
01271   // no, it's not a single mailbox. Try if it's a group:
01272   if ( !parseGroup( scursor, send, maybeAddress, isCRLF ) ) {
01273     return false;
01274   }
01275 
01276   result = maybeAddress;
01277   return true;
01278 }
01279 
01280 bool parseAddressList( const char* &scursor, const char * const send,
01281                        AddressList &result, bool isCRLF )
01282 {
01283   while ( scursor != send ) {
01284     eatCFWS( scursor, send, isCRLF );
01285     // end of header: this is OK.
01286     if ( scursor == send ) {
01287       return true;
01288     }
01289     // empty entry: ignore:
01290     if ( *scursor == ',' ) {
01291       scursor++;
01292       continue;
01293     }
01294     // broken clients might use ';' as list delimiter, accept that as well
01295     if ( *scursor == ';' ) {
01296       scursor++;
01297       continue;
01298     }
01299 
01300     // parse one entry
01301     Address maybeAddress;
01302     if ( !parseAddress( scursor, send, maybeAddress, isCRLF ) ) {
01303       return false;
01304     }
01305     result.append( maybeAddress );
01306 
01307     eatCFWS( scursor, send, isCRLF );
01308     // end of header: this is OK.
01309     if ( scursor == send ) {
01310       return true;
01311     }
01312     // comma separating entries: eat it.
01313     if ( *scursor == ',' ) {
01314       scursor++;
01315     }
01316   }
01317   return true;
01318 }
01319 
01320 static QString asterisk = QString::fromLatin1( "*0*", 1 );
01321 static QString asteriskZero = QString::fromLatin1( "*0*", 2 );
01322 //static QString asteriskZeroAsterisk = QString::fromLatin1( "*0*", 3 );
01323 
01324 bool parseParameter( const char* &scursor, const char * const send,
01325                      QPair<QString,QStringOrQPair> &result, bool isCRLF )
01326 {
01327   // parameter = regular-parameter / extended-parameter
01328   // regular-parameter = regular-parameter-name "=" value
01329   // extended-parameter =
01330   // value = token / quoted-string
01331   //
01332   // note that rfc2231 handling is out of the scope of this function.
01333   // Therefore we return the attribute as QString and the value as
01334   // (start,length) tupel if we see that the value is encoded
01335   // (trailing asterisk), for parseParameterList to decode...
01336 
01337   eatCFWS( scursor, send, isCRLF );
01338   if ( scursor == send ) {
01339     return false;
01340   }
01341 
01342   //
01343   // parse the parameter name:
01344   //
01345   QString maybeAttribute;
01346   if ( !parseToken( scursor, send, maybeAttribute, false /* no 8bit */ ) ) {
01347     return false;
01348   }
01349 
01350   eatCFWS( scursor, send, isCRLF );
01351   // premature end: not OK (haven't seen '=' yet).
01352   if ( scursor == send || *scursor != '=' ) {
01353     return false;
01354   }
01355   scursor++; // eat '='
01356 
01357   eatCFWS( scursor, send, isCRLF );
01358   if ( scursor == send ) {
01359     // don't choke on attribute=, meaning the value was omitted:
01360     if ( maybeAttribute.endsWith( asterisk ) ) {
01361       KMIME_WARN << "attribute ends with \"*\", but value is empty!"
01362         "Chopping away \"*\".";
01363       maybeAttribute.truncate( maybeAttribute.length() - 1 );
01364     }
01365     result = qMakePair( maybeAttribute.toLower(), QStringOrQPair() );
01366     return true;
01367   }
01368 
01369   const char * oldscursor = scursor;
01370 
01371   //
01372   // parse the parameter value:
01373   //
01374   QStringOrQPair maybeValue;
01375   if ( *scursor == '"' ) {
01376     // value is a quoted-string:
01377     scursor++;
01378     if ( maybeAttribute.endsWith( asterisk ) ) {
01379       // attributes ending with "*" designate extended-parameters,
01380       // which cannot have quoted-strings as values. So we remove the
01381       // trailing "*" to not confuse upper layers.
01382       KMIME_WARN << "attribute ends with \"*\", but value is a quoted-string!"
01383         "Chopping away \"*\".";
01384       maybeAttribute.truncate( maybeAttribute.length() - 1 );
01385     }
01386 
01387     if ( !parseGenericQuotedString( scursor, send, maybeValue.qstring, isCRLF ) ) {
01388       scursor = oldscursor;
01389       result = qMakePair( maybeAttribute.toLower(), QStringOrQPair() );
01390       return false; // this case needs further processing by upper layers!!
01391     }
01392   } else {
01393     // value is a token:
01394     if ( !parseToken( scursor, send, maybeValue.qpair, false /* no 8bit */ ) ) {
01395       scursor = oldscursor;
01396       result = qMakePair( maybeAttribute.toLower(), QStringOrQPair() );
01397       return false; // this case needs further processing by upper layers!!
01398     }
01399   }
01400 
01401   result = qMakePair( maybeAttribute.toLower(), maybeValue );
01402   return true;
01403 }
01404 
01405 bool parseRawParameterList( const char* &scursor, const char * const send,
01406                             QMap<QString,QStringOrQPair> &result,
01407                             bool isCRLF )
01408 {
01409   // we use parseParameter() consecutively to obtain a map of raw
01410   // attributes to raw values. "Raw" here means that we don't do
01411   // rfc2231 decoding and concatenation. This is left to
01412   // parseParameterList(), which will call this function.
01413   //
01414   // The main reason for making this chunk of code a separate
01415   // (private) method is that we can deal with broken parameters
01416   // _here_ and leave the rfc2231 handling solely to
01417   // parseParameterList(), which will still be enough work.
01418 
01419   while ( scursor != send ) {
01420     eatCFWS( scursor, send, isCRLF );
01421     // empty entry ending the list: OK.
01422     if ( scursor == send ) {
01423       return true;
01424     }
01425     // empty list entry: ignore.
01426     if ( *scursor == ';' ) {
01427       scursor++;
01428       continue;
01429     }
01430 
01431     QPair<QString,QStringOrQPair> maybeParameter;
01432     if ( !parseParameter( scursor, send, maybeParameter, isCRLF ) ) {
01433       // we need to do a bit of work if the attribute is not
01434       // NULL. These are the cases marked with "needs further
01435       // processing" in parseParameter(). Specifically, parsing of the
01436       // token or the quoted-string, which should represent the value,
01437       // failed. We take the easy way out and simply search for the
01438       // next ';' to start parsing again. (Another option would be to
01439       // take the text between '=' and ';' as value)
01440       if ( maybeParameter.first.isNull() ) {
01441         return false;
01442       }
01443       while ( scursor != send ) {
01444         if ( *scursor++ == ';' ) {
01445           goto IS_SEMICOLON;
01446         }
01447       }
01448       // scursor == send case: end of list.
01449       return true;
01450     IS_SEMICOLON:
01451       // *scursor == ';' case: parse next entry.
01452       continue;
01453     }
01454     // successful parsing brings us here:
01455     result.insert( maybeParameter.first, maybeParameter.second );
01456 
01457     eatCFWS( scursor, send, isCRLF );
01458     // end of header: ends list.
01459     if ( scursor == send ) {
01460       return true;
01461     }
01462     // regular separator: eat it.
01463     if ( *scursor == ';' ) {
01464       scursor++;
01465     }
01466   }
01467   return true;
01468 }
01469 
01470 static void decodeRFC2231Value( Codec* &rfc2231Codec,
01471                                 QTextCodec* &textcodec,
01472                                 bool isContinuation, QString &value,
01473                                 QPair<const char*,int> &source )
01474 {
01475   //
01476   // parse the raw value into (charset,language,text):
01477   //
01478 
01479   const char * decBegin = source.first;
01480   const char * decCursor = decBegin;
01481   const char * decEnd = decCursor + source.second;
01482 
01483   if ( !isContinuation ) {
01484     // find the first single quote
01485     while ( decCursor != decEnd ) {
01486       if ( *decCursor == '\'' ) {
01487         break;
01488       } else {
01489         decCursor++;
01490       }
01491     }
01492 
01493     if ( decCursor == decEnd ) {
01494       // there wasn't a single single quote at all!
01495       // take the whole value to be in latin-1:
01496       KMIME_WARN << "No charset in extended-initial-value."
01497         "Assuming \"iso-8859-1\".";
01498       value += QString::fromLatin1( decBegin, source.second );
01499       return;
01500     }
01501 
01502     QByteArray charset( decBegin, decCursor - decBegin );
01503 
01504     const char * oldDecCursor = ++decCursor;
01505     // find the second single quote (we ignore the language tag):
01506     while ( decCursor != decEnd ) {
01507       if ( *decCursor == '\'' ) {
01508         break;
01509       } else {
01510         decCursor++;
01511       }
01512     }
01513     if ( decCursor == decEnd ) {
01514       KMIME_WARN << "No language in extended-initial-value."
01515         "Trying to recover.";
01516       decCursor = oldDecCursor;
01517     } else {
01518       decCursor++;
01519     }
01520 
01521     // decCursor now points to the start of the
01522     // "extended-other-values":
01523 
01524     //
01525     // get the decoders:
01526     //
01527 
01528     bool matchOK = false;
01529     textcodec = KGlobal::charsets()->codecForName( charset, matchOK );
01530     if ( !matchOK ) {
01531       textcodec = 0;
01532       KMIME_WARN_UNKNOWN( Charset, charset );
01533     }
01534   }
01535 
01536   if ( !rfc2231Codec ) {
01537     rfc2231Codec = Codec::codecForName("x-kmime-rfc2231");
01538     assert( rfc2231Codec );
01539   }
01540 
01541   if ( !textcodec ) {
01542     value += QString::fromLatin1( decCursor, decEnd - decCursor );
01543     return;
01544   }
01545 
01546   Decoder * dec = rfc2231Codec->makeDecoder();
01547   assert( dec );
01548 
01549   //
01550   // do the decoding:
01551   //
01552 
01553   QByteArray buffer;
01554   buffer.resize( rfc2231Codec->maxDecodedSizeFor( decEnd - decCursor ) );
01555   QByteArray::Iterator bit = buffer.begin();
01556   QByteArray::ConstIterator bend = buffer.end();
01557 
01558   if ( !dec->decode( decCursor, decEnd, bit, bend ) ) {
01559     KMIME_WARN << rfc2231Codec->name()
01560                << "codec lies about its maxDecodedSizeFor()" << endl
01561                << "result may be truncated";
01562   }
01563 
01564   value += textcodec->toUnicode( buffer.begin(), bit - buffer.begin() );
01565 
01566   // kDebug() << "value now: \"" << value << "\"";
01567   // cleanup:
01568   delete dec;
01569 }
01570 
01571 // known issues:
01572 //  - permutes rfc2231 continuations when the total number of parts
01573 //    exceeds 10 (other-sections then becomes *xy, ie. two digits)
01574 
01575 bool parseParameterList( const char* &scursor, const char * const send,
01576                          QMap<QString,QString> &result, bool isCRLF )
01577 {
01578   // parse the list into raw attribute-value pairs:
01579   QMap<QString,QStringOrQPair> rawParameterList;
01580   if (!parseRawParameterList( scursor, send, rawParameterList, isCRLF ) ) {
01581     return false;
01582   }
01583 
01584   if ( rawParameterList.isEmpty() ) {
01585     return true;
01586   }
01587 
01588   // decode rfc 2231 continuations and alternate charset encoding:
01589 
01590   // NOTE: this code assumes that what QMapIterator delivers is sorted
01591   // by the key!
01592 
01593   Codec * rfc2231Codec = 0;
01594   QTextCodec * textcodec = 0;
01595   QString attribute;
01596   QString value;
01597   enum Modes {
01598     NoMode = 0x0, Continued = 0x1, Encoded = 0x2
01599   } mode;
01600 
01601   QMap<QString,QStringOrQPair>::Iterator it, end = rawParameterList.end();
01602 
01603   for ( it = rawParameterList.begin() ; it != end ; ++it ) {
01604     if ( attribute.isNull() || !it.key().startsWith( attribute ) ) {
01605       //
01606       // new attribute:
01607       //
01608 
01609       // store the last attribute/value pair in the result map now:
01610       if ( !attribute.isNull() ) {
01611         result.insert( attribute, value );
01612       }
01613       // and extract the information from the new raw attribute:
01614       value.clear();
01615       attribute = it.key();
01616       mode = NoMode;
01617       // is the value encoded?
01618       if ( attribute.endsWith( asterisk ) ) {
01619         attribute.truncate( attribute.length() - 1 );
01620         mode = (Modes) ((int) mode | Encoded);
01621       }
01622       // is the value continued?
01623       if ( attribute.endsWith( asteriskZero ) ) {
01624         attribute.truncate( attribute.length() - 2 );
01625         mode = (Modes) ((int) mode | Continued);
01626       }
01627       //
01628       // decode if necessary:
01629       //
01630       if ( mode & Encoded ) {
01631         decodeRFC2231Value( rfc2231Codec, textcodec,
01632                             false, /* isn't continuation */
01633                             value, (*it).qpair );
01634       } else {
01635         // not encoded.
01636         if ( (*it).qpair.first ) {
01637           value += QString::fromLatin1( (*it).qpair.first, (*it).qpair.second );
01638         } else {
01639           value += (*it).qstring;
01640         }
01641       }
01642 
01643       //
01644       // shortcut-processing when the value isn't encoded:
01645       //
01646 
01647       if ( !(mode & Continued) ) {
01648         // save result already:
01649         result.insert( attribute, value );
01650         // force begin of a new attribute:
01651         attribute.clear();
01652       }
01653     } else { // it.key().startsWith( attribute )
01654       //
01655       // continuation
01656       //
01657 
01658       // ignore the section and trust QMap to have sorted the keys:
01659       if ( it.key().endsWith( asterisk ) ) {
01660         // encoded
01661         decodeRFC2231Value( rfc2231Codec, textcodec,
01662                             true, /* is continuation */
01663                             value, (*it).qpair );
01664       } else {
01665         // not encoded
01666         if ( (*it).qpair.first ) {
01667           value += QString::fromLatin1( (*it).qpair.first, (*it).qpair.second );
01668         } else {
01669           value += (*it).qstring;
01670         }
01671       }
01672     }
01673   }
01674 
01675   // write last attr/value pair:
01676   if ( !attribute.isNull() ) {
01677     result.insert( attribute, value );
01678   }
01679 
01680   return true;
01681 }
01682 
01683 static const char * const stdDayNames[] = {
01684   "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"
01685 };
01686 static const int stdDayNamesLen = sizeof stdDayNames / sizeof *stdDayNames;
01687 
01688 static bool parseDayName( const char* &scursor, const char * const send )
01689 {
01690   // check bounds:
01691   if ( send - scursor < 3 ) {
01692     return false;
01693   }
01694 
01695   for ( int i = 0 ; i < stdDayNamesLen ; ++i ) {
01696     if ( qstrnicmp( scursor, stdDayNames[i], 3 ) == 0 ) {
01697       scursor += 3;
01698       // kDebug() << "found" << stdDayNames[i];
01699       return true;
01700     }
01701   }
01702 
01703   return false;
01704 }
01705 
01706 static const char * const stdMonthNames[] = {
01707   "Jan", "Feb", "Mar", "Apr", "May", "Jun",
01708   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
01709 };
01710 static const int stdMonthNamesLen =
01711                               sizeof stdMonthNames / sizeof *stdMonthNames;
01712 
01713 static bool parseMonthName( const char* &scursor, const char * const send,
01714                             int &result )
01715 {
01716   // check bounds:
01717   if ( send - scursor < 3 ) {
01718     return false;
01719   }
01720 
01721   for ( result = 0 ; result < stdMonthNamesLen ; ++result ) {
01722     if ( qstrnicmp( scursor, stdMonthNames[result], 3 ) == 0 ) {
01723       scursor += 3;
01724       return true;
01725     }
01726   }
01727 
01728   // not found:
01729   return false;
01730 }
01731 
// Alphabetic time zone names and their offsets in seconds east of GMT.
// The table is searched front to back and the first match wins, so every
// name must appear only once.
static const struct {
  const char * tzName;
  long int secsEastOfGMT;
} timeZones[] = {
  // rfc 822 timezones:
  { "GMT", 0 },
  { "UT", 0 },
  { "EDT", -4*3600 },
  { "EST", -5*3600 },
  // This entry used to be a second "MST" with -5h; -0500 is CDT (see the
  // obs-zone list: "CST" / "CDT" ; -0600 / -0500). The duplicate made MST
  // resolve to -5h and left CDT unknown.
  { "CDT", -5*3600 },
  { "CST", -6*3600 },
  { "MDT", -6*3600 },
  { "MST", -7*3600 },
  { "PDT", -7*3600 },
  { "PST", -8*3600 },
  // common, non-rfc-822 zones:
  { "CET", 1*3600 },
  { "MET", 1*3600 },
  { "UTC", 0 },
  { "CEST", 2*3600 },
  { "BST", 1*3600 },
  // rfc 822 military timezones:
  { "Z", 0 },
  { "A", -1*3600 },
  { "B", -2*3600 },
  { "C", -3*3600 },
  { "D", -4*3600 },
  { "E", -5*3600 },
  { "F", -6*3600 },
  { "G", -7*3600 },
  { "H", -8*3600 },
  { "I", -9*3600 },
  // J is not used!
  { "K", -10*3600 },
  { "L", -11*3600 },
  { "M", -12*3600 },
  { "N", 1*3600 },
  { "O", 2*3600 },
  { "P", 3*3600 },
  { "Q", 4*3600 },
  { "R", 5*3600 },
  { "S", 6*3600 },
  { "T", 7*3600 },
  { "U", 8*3600 },
  { "V", 9*3600 },
  { "W", 10*3600 },
  { "X", 11*3600 },
  { "Y", 12*3600 },
};
static const int timeZonesLen = sizeof timeZones / sizeof *timeZones;
01782 
01783 static bool parseAlphaNumericTimeZone( const char* &scursor,
01784                                        const char * const send,
01785                                        long int &secsEastOfGMT,
01786                                        bool &timeZoneKnown )
01787 {
01788   QPair<const char*,int> maybeTimeZone( 0, 0 );
01789   if ( !parseToken( scursor, send, maybeTimeZone, false /*no 8bit*/ ) ) {
01790     return false;
01791   }
01792   for ( int i = 0 ; i < timeZonesLen ; ++i ) {
01793     if ( qstrnicmp( timeZones[i].tzName,
01794                     maybeTimeZone.first, maybeTimeZone.second ) == 0 ) {
01795       scursor += maybeTimeZone.second;
01796       secsEastOfGMT = timeZones[i].secsEastOfGMT;
01797       timeZoneKnown = true;
01798       return true;
01799     }
01800   }
01801 
01802   // don't choke just because we don't happen to know the time zone
01803   KMIME_WARN_UNKNOWN( time zone,
01804                       QByteArray( maybeTimeZone.first, maybeTimeZone.second ) );
01805   secsEastOfGMT = 0;
01806   timeZoneKnown = false;
01807   return true;
01808 }
01809 
// Parses a run of decimal digits starting at scursor.
//
// scursor  in/out: advanced past the digits that were consumed
// send     end of the input
// result   the value of the digits read, 0 if there were none
//
// Returns the number of digits parsed (0 if scursor is not at a digit).
int parseDigits( const char* &scursor, const char * const send, int &result )
{
  result = 0;
  int digits = 0;
  // isdigit() must not be given a negative value, which a plain char
  // holding an 8bit character can be; go through unsigned char.
  while ( scursor != send &&
          isdigit( static_cast<unsigned char>( *scursor ) ) ) {
    result = result * 10 + int( *scursor - '0' );
    ++scursor;
    ++digits;
  }
  return digits;
}
01821 
01822 static bool parseTimeOfDay( const char* &scursor, const char * const send,
01823                             int &hour, int &min, int &sec, bool isCRLF=false )
01824 {
01825   // time-of-day := 2DIGIT [CFWS] ":" [CFWS] 2DIGIT [ [CFWS] ":" 2DIGIT ]
01826 
01827   //
01828   // 2DIGIT representing "hour":
01829   //
01830   if ( !parseDigits( scursor, send, hour ) ) {
01831     return false;
01832   }
01833 
01834   eatCFWS( scursor, send, isCRLF );
01835   if ( scursor == send || *scursor != ':' ) {
01836     return false;
01837   }
01838   scursor++; // eat ':'
01839 
01840   eatCFWS( scursor, send, isCRLF );
01841   if ( scursor == send ) {
01842     return false;
01843   }
01844 
01845   //
01846   // 2DIGIT representing "minute":
01847   //
01848   if ( !parseDigits( scursor, send, min ) ) {
01849     return false;
01850   }
01851 
01852   eatCFWS( scursor, send, isCRLF );
01853   if ( scursor == send ) {
01854     return true; // seconds are optional
01855   }
01856 
01857   //
01858   // let's see if we have a 2DIGIT representing "second":
01859   //
01860   if ( *scursor == ':' ) {
01861     // yepp, there are seconds:
01862     scursor++; // eat ':'
01863     eatCFWS( scursor, send, isCRLF );
01864     if ( scursor == send ) {
01865       return false;
01866     }
01867 
01868     if ( !parseDigits( scursor, send, sec ) ) {
01869       return false;
01870     }
01871   } else {
01872     sec = 0;
01873   }
01874 
01875   return true;
01876 }
01877 
01878 bool parseTime( const char* &scursor, const char * send,
01879                 int &hour, int &min, int &sec, long int &secsEastOfGMT,
01880                 bool &timeZoneKnown, bool isCRLF )
01881 {
01882   // time := time-of-day CFWS ( zone / obs-zone )
01883   //
01884   // obs-zone    := "UT" / "GMT" /
01885   //                "EST" / "EDT" / ; -0500 / -0400
01886   //                "CST" / "CDT" / ; -0600 / -0500
01887   //                "MST" / "MDT" / ; -0700 / -0600
01888   //                "PST" / "PDT" / ; -0800 / -0700
01889   //                "A"-"I" / "a"-"i" /
01890   //                "K"-"Z" / "k"-"z"
01891 
01892   eatCFWS( scursor, send, isCRLF );
01893   if ( scursor == send ) {
01894     return false;
01895   }
01896 
01897   if ( !parseTimeOfDay( scursor, send, hour, min, sec, isCRLF ) ) {
01898     return false;
01899   }
01900 
01901   eatCFWS( scursor, send, isCRLF );
01902   if ( scursor == send ) {
01903     timeZoneKnown = false;
01904     secsEastOfGMT = 0;
01905     return true; // allow missing timezone
01906   }
01907 
01908   timeZoneKnown = true;
01909   if ( *scursor == '+' || *scursor == '-' ) {
01910     // remember and eat '-'/'+':
01911     const char sign = *scursor++;
01912     // numerical timezone:
01913     int maybeTimeZone;
01914     if ( parseDigits( scursor, send, maybeTimeZone ) != 4 ) {
01915       return false;
01916     }
01917     secsEastOfGMT = 60 * ( maybeTimeZone / 100 * 60 + maybeTimeZone % 100 );
01918     if ( sign == '-' ) {
01919       secsEastOfGMT *= -1;
01920       if ( secsEastOfGMT == 0 ) {
01921         timeZoneKnown = false; // -0000 means indetermined tz
01922       }
01923     }
01924   } else {
01925     // maybe alphanumeric timezone:
01926     if ( !parseAlphaNumericTimeZone( scursor, send, secsEastOfGMT, timeZoneKnown ) ) {
01927       return false;
01928     }
01929   }
01930   return true;
01931 }
01932 
01933 bool parseDateTime( const char* &scursor, const char * const send,
01934                     KDateTime &result, bool isCRLF )
01935 {
01936   // Parsing date-time; strict mode:
01937   //
01938   // date-time   := [ [CFWS] day-name [CFWS] "," ]                      ; wday
01939   // (expanded)     [CFWS] 1*2DIGIT CFWS month-name CFWS 2*DIGIT [CFWS] ; date
01940   //                time
01941   //
01942   // day-name    := "Mon" / "Tue" / "Wed" / "Thu" / "Fri" / "Sat" / "Sun"
01943   // month-name  := "Jan" / "Feb" / "Mar" / "Apr" / "May" / "Jun" /
01944   //                "Jul" / "Aug" / "Sep" / "Oct" / "Nov" / "Dec"
01945 
01946   result = KDateTime();
01947   QDateTime maybeDateTime;
01948 
01949   eatCFWS( scursor, send, isCRLF );
01950   if ( scursor == send ) {
01951     return false;
01952   }
01953 
01954   //
01955   // let's see if there's a day-of-week:
01956   //
01957   if ( parseDayName( scursor, send ) ) {
01958     eatCFWS( scursor, send, isCRLF );
01959     if ( scursor == send ) {
01960       return false;
01961     }
01962     // day-name should be followed by ',' but we treat it as optional:
01963     if ( *scursor == ',' ) {
01964       scursor++; // eat ','
01965       eatCFWS( scursor, send, isCRLF );
01966     }
01967   }
01968 
01969   //
01970   // 1*2DIGIT representing "day" (of month):
01971   //
01972   int maybeDay;
01973   if ( !parseDigits( scursor, send, maybeDay ) ) {
01974     return false;
01975   }
01976 
01977   eatCFWS( scursor, send, isCRLF );
01978   if ( scursor == send ) {
01979     return false;
01980   }
01981 
01982   //
01983   // month-name:
01984   //
01985   int maybeMonth = 0;
01986   if ( !parseMonthName( scursor, send, maybeMonth ) ) {
01987     return false;
01988   }
01989   if ( scursor == send ) {
01990     return false;
01991   }
01992   assert( maybeMonth >= 0 ); assert( maybeMonth <= 11 );
01993   ++maybeMonth; // 0-11 -> 1-12
01994 
01995   eatCFWS( scursor, send, isCRLF );
01996   if ( scursor == send ) {
01997     return false;
01998   }
01999 
02000   //
02001   // 2*DIGIT representing "year":
02002   //
02003   int maybeYear;
02004   if ( !parseDigits( scursor, send, maybeYear ) ) {
02005     return false;
02006   }
02007   // RFC 2822 4.3 processing:
02008   if ( maybeYear < 50 ) {
02009     maybeYear += 2000;
02010   } else if ( maybeYear < 1000 ) {
02011     maybeYear += 1900;
02012   }
02013   // else keep as is
02014   if ( maybeYear < 1900 ) {
02015     return false; // rfc2822, 3.3
02016   }
02017 
02018   eatCFWS( scursor, send, isCRLF );
02019   if ( scursor == send ) {
02020     return false;
02021   }
02022 
02023   maybeDateTime.setDate( QDate( maybeYear, maybeMonth, maybeDay ) );
02024 
02025   //
02026   // time
02027   //
02028   int maybeHour, maybeMinute, maybeSecond;
02029   long int secsEastOfGMT;
02030   bool timeZoneKnown = true;
02031 
02032   if ( !parseTime( scursor, send,
02033                    maybeHour, maybeMinute, maybeSecond,
02034                    secsEastOfGMT, timeZoneKnown, isCRLF ) ) {
02035     return false;
02036   }
02037 
02038   maybeDateTime.setTime( QTime( maybeHour, maybeMinute, maybeSecond ) );
02039   if ( !maybeDateTime.isValid() )
02040     return false;
02041 
02042   result = KDateTime( maybeDateTime, KDateTime::Spec( KDateTime::OffsetFromUTC, secsEastOfGMT ) );
02043   if ( !result.isValid() )
02044     return false;
02045   return true;
02046 }
02047 
02048 Headers::Base *extractFirstHeader( QByteArray &head )
02049 {
02050   int endOfFieldBody = 0;
02051   int len = head.length() - 1;
02052   bool folded = false;
02053   Headers::Base *header = 0;
02054 
02055   int startOfFieldBody = head.indexOf( ":" );
02056   const int endOfFieldHeader = startOfFieldBody;
02057 
02058   if ( startOfFieldBody > -1 ) {    //there is another header
02059     startOfFieldBody++; //skip the ':'
02060     if ( head[startOfFieldBody] == ' ' ) { // skip the space after the ':', if there
02061       startOfFieldBody++;
02062     }
02063     endOfFieldBody = startOfFieldBody;
02064 
02065     // If the first line contains nothing, but the next line starts with a space
02066     // or a tab, that means a stupid mail client has made the first header field line
02067     // entirely empty, and has folded the rest to the next line(s).
02068     if ( head[endOfFieldBody] == '\n' && endOfFieldBody + 1 < len &&
02069          ( head[endOfFieldBody + 1] == ' ' ||
02070            head[endOfFieldBody + 1] == '\t' ) ) {
02071 
02072       // Skip \n and first whitespace
02073       startOfFieldBody += 2;
02074       endOfFieldBody += 2;
02075     }
02076     
02077     if ( head[endOfFieldBody] != '\n' ) {  // check if the header is not empty
02078       while ( 1 ) {
02079         endOfFieldBody = head.indexOf( '\n', endOfFieldBody + 1 );
02080         if ( endOfFieldBody == -1 || endOfFieldBody == len ||
02081               ( head[endOfFieldBody+1] != ' ' &&
02082                 head[endOfFieldBody+1] != '\t' ) ) {
02083           //break if we reach the end of the string, honor folded lines
02084           break;
02085         } else {
02086           folded = true;
02087         }
02088       }
02089     }
02090 
02091     if ( endOfFieldBody < 0 ) {
02092       endOfFieldBody = len + 1; //take the rest of the string
02093     }
02094 
02095     QByteArray rawType = head.left( endOfFieldHeader );
02096     QByteArray rawFieldBody = head.mid( startOfFieldBody, endOfFieldBody - startOfFieldBody );
02097     if ( folded ) {
02098       rawFieldBody = unfoldHeader( rawFieldBody );
02099     }
02100     header = HeaderFactory::self()->createHeader( rawType );
02101     if( !header ) {
02102       //kWarning() << "Returning Generic header of type" << rawType;
02103       header = new Headers::Generic( rawType );
02104     }
02105     header->from7BitString( rawFieldBody );
02106 
02107     head.remove( 0, endOfFieldBody + 1 );
02108   } else {
02109     head.clear();
02110   }
02111 
02112   return header;
02113 }
02114 
02115 Headers::Base::List parseHeaders( const QByteArray &head )
02116 {
02117   Headers::Base::List ret;
02118   Headers::Base *h;
02119 
02120   QByteArray copy = head;
02121   while( ( h = extractFirstHeader( copy ) ) ) {
02122     ret << h;
02123   }
02124 
02125   return ret;
02126 }
02127 
02128 } // namespace HeaderParsing
02129 
02130 } // namespace KMime

KMIME Library

Skip menu "KMIME Library"
  • Main Page
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

KDE-PIM Libraries

Skip menu "KDE-PIM Libraries"
  • akonadi
  •   contact
  •   kmime
  • kabc
  • kblog
  • kcal
  • kholidays
  • kimap
  • kioslave
  •   imap4
  •   mbox
  •   nntp
  • kldap
  • kmime
  • kontactinterface
  • kpimidentities
  • kpimtextedit
  •   richtextbuilders
  • kpimutils
  • kresources
  • ktnef
  • kxmlrpcclient
  • mailtransport
  • microblog
  • qgpgme
  • syndication
  •   atom
  •   rdf
  •   rss2
Generated for KDE-PIM Libraries by doxygen 1.6.2-20100208
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal