blob: b61c50b043251648c7f9b1c36f0fed844de089e6 [file] [log] [blame] [raw]
/** @file
* IPRT - String Manipulation, Latin-1 (ISO-8859-1) encoding.
*/
/*
* Copyright (C) 2006-2015 Oracle Corporation
*
* This file is part of VirtualBox Open Source Edition (OSE), as
* available from http://www.virtualbox.org. This file is free software;
* you can redistribute it and/or modify it under the terms of the GNU
* General Public License (GPL) as published by the Free Software
* Foundation, in version 2 as it comes in the "COPYING" file of the
* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
*
* The contents of this file may alternatively be used under the terms
* of the Common Development and Distribution License Version 1.0
* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
* VirtualBox OSE distribution, in which case the provisions of the
* CDDL are applicable instead of those of the GPL.
*
* You may elect to license modified versions of this file under the
* terms and conditions of either the GPL or the CDDL or both.
*/
#ifndef ___iprt_latin1_h
#define ___iprt_latin1_h
#include <iprt/string.h>
RT_C_DECLS_BEGIN
/** @defgroup rt_str_latin1 Latin-1 (ISO-8859-1) String Manipulation
* @ingroup grp_rt_str
*
* Deals with Latin-1 encoded strings.
*
* @warning Make sure to name all variables dealing with Latin-1 strings
* such that there is no way to mistake them for normal UTF-8 strings.
* There may be severe security issues resulting from mistaking Latin-1
* for UTF-8!
*
* @{
*/
/**
 * Reads the code point stored at the given position of a Latin-1 string.
 *
 * The char at the position is read as an unsigned byte and returned as the
 * code point value; there is no failure path.
 *
 * @returns The code point, i.e. the unsigned value of the char at pszLatin1.
 * @param   pszLatin1   The position in the Latin-1 string to read from.
 */
DECLINLINE(RTUNICP) RTLatin1GetCp(const char *pszLatin1)
{
    const unsigned char *pbCur = (const unsigned char *)pszLatin1;
    return *pbCur;
}
/**
 * Gets the code point at the current string position and advances the string
 * pointer past it.
 *
 * No terminator check is performed; the char at the current position is
 * always consumed.
 *
 * @returns VINF_SUCCESS (there is no failure path).
 * @param   ppszLatin1  Pointer to the string pointer.  Updated to point to the
 *                      char following the code point that was read.
 * @param   pCp         Where to store the code point (the unsigned value of
 *                      the char that was read).
 */
DECLINLINE(int) RTLatin1GetCpEx(const char **ppszLatin1, PRTUNICP pCp)
{
    /* Cast the character pointer, not the pointer-to-pointer: reading a
       'const char *' object through a 'const unsigned char **' violates the
       effective type rules, while any object may be read as unsigned char. */
    const unsigned char uch = *(const unsigned char *)*ppszLatin1;
    (*ppszLatin1)++;
    *pCp = uch;
    return VINF_SUCCESS;
}
/**
 * Gets the code point at the current string position for a string of a given
 * maximum length, advancing the string pointer and decrementing the length.
 *
 * No terminator check is performed; only the remaining length limits the read.
 *
 * @returns iprt status code.
 * @retval  VINF_SUCCESS if a code point was read.
 * @retval  VERR_END_OF_STRING if *pcchLatin1 is 0.  *pCp is set to
 *          RTUNICP_INVALID, while *ppszLatin1 and *pcchLatin1 are left
 *          unchanged.
 *
 * @param   ppszLatin1  Pointer to the string pointer.  On success this is
 *                      updated to point to the char following the code point
 *                      that was read.
 * @param   pcchLatin1  Pointer to the maximum string length.  On success this
 *                      is decremented by one (the size of the code point).
 * @param   pCp         Where to store the code point.
 *                      RTUNICP_INVALID is stored here on failure.
 */
DECLINLINE(int) RTLatin1GetCpNEx(const char **ppszLatin1, size_t *pcchLatin1, PRTUNICP pCp)
{
    if (RT_LIKELY(*pcchLatin1 != 0))
    {
        /* Cast the character pointer, not the pointer-to-pointer: reading a
           'const char *' object through a 'const unsigned char **' violates
           the effective type rules, while any object may be read as
           unsigned char. */
        const unsigned char uch = *(const unsigned char *)*ppszLatin1;
        (*ppszLatin1)++;
        (*pcchLatin1)--;
        *pCp = uch;
        return VINF_SUCCESS;
    }

    /* Nothing left to read. */
    *pCp = RTUNICP_INVALID;
    return VERR_END_OF_STRING;
}
/**
 * Gets the number of Latin-1 chars needed to encode a Unicode code point.
 *
 * The code point is expected to be a valid Unicode one, but it does not have
 * to lie within the range representable in Latin-1.
 *
 * @returns 1 if the code point is below 0x100, otherwise 0 to indicate that
 *          there is no Latin-1 encoding for it.
 * @param   CodePoint   The Unicode code point to size.
 */
DECLINLINE(size_t) RTLatin1CpSize(RTUNICP CodePoint)
{
    return CodePoint < 0x100 ? 1 : 0;
}
/**
 * Writes a code point at the given string position and returns the position
 * following it.
 *
 * Nothing at or after pszLatin1 is examined or moved, so this is suitable for
 * appending to or overwriting a string, but not for inserting into one.
 *
 * @returns Pointer to the char following the written code point.
 * @returns NULL (via AssertReturn) if CodePoint is not below 0x100; nothing
 *          is written in that case.
 * @param   pszLatin1   Where to write the code point.
 * @param   CodePoint   The code point to write.  This should not be
 *                      RTUNICP_INVALID or any other value outside the
 *                      Latin-1 range.
 */
DECLINLINE(char *) RTLatin1PutCp(char *pszLatin1, RTUNICP CodePoint)
{
    AssertReturn(CodePoint < 0x100, NULL);
    *pszLatin1 = (unsigned char)CodePoint;
    return pszLatin1 + 1;
}
/**
 * Steps past the current code point.
 *
 * @returns Pointer to the char after the current code point (const is cast
 *          away from the input pointer).
 * @param   pszLatin1   Pointer to the current code point.
 * @remark  This simply advances by one char; it does not check whether the
 *          current char is the string terminator.
 */
DECLINLINE(char *) RTLatin1NextCp(const char *pszLatin1)
{
    return (char *)(pszLatin1 + 1);
}
/**
 * Steps back to the previous code point.
 *
 * @returns Pointer to the char before the current code point.
 * @returns pszLatin1Start if pszLatin1 is not located after the start of the
 *          string.
 * @param   pszLatin1Start  Pointer to the start of the string.
 * @param   pszLatin1       Pointer to the current code point.
 */
DECLINLINE(char *) RTLatin1PrevCp(const char *pszLatin1Start, const char *pszLatin1)
{
    /* At (or before) the start of the string: there is nothing to step back to. */
    if ((uintptr_t)pszLatin1 <= (uintptr_t)pszLatin1Start)
        return (char *)pszLatin1Start;

    return (char *)(pszLatin1 - 1);
}
/**
 * Translate a Latin-1 string into UTF-8, allocating the result buffer (default
 * tag).
 *
 * Expands to a RTLatin1ToUtf8Tag() call with RTSTR_TAG as the allocation tag.
 *
 * @returns iprt status code.
 * @param   pszLatin1   Latin-1 string to convert.
 * @param   ppszString  Receives pointer of allocated UTF-8 string on
 *                      success, and is always set to NULL on failure.
 *                      The returned pointer must be freed using RTStrFree().
 */
#define RTLatin1ToUtf8(pszLatin1, ppszString) RTLatin1ToUtf8Tag((pszLatin1), (ppszString), RTSTR_TAG)
/**
 * Translate a Latin-1 string into UTF-8, allocating the result buffer (custom
 * tag).
 *
 * @returns iprt status code.
 * @param   pszLatin1   Latin-1 string to convert.
 * @param   ppszString  Receives pointer of allocated UTF-8 string on
 *                      success, and is always set to NULL on failure.
 *                      The returned pointer must be freed using RTStrFree().
 * @param   pszTag      Allocation tag used for statistics and such.
 */
RTDECL(int) RTLatin1ToUtf8Tag(const char *pszLatin1, char **ppszString, const char *pszTag);
/**
 * Translates Latin-1 to UTF-8 using buffer provided by the caller or a fittingly
 * sized buffer allocated by the function (default tag).
 *
 * Expands to a RTLatin1ToUtf8ExTag() call with RTSTR_TAG as the allocation tag.
 *
 * @returns iprt status code.
 * @param   pszLatin1   The Latin-1 string to convert.
 * @param   cchLatin1   The number of Latin-1 characters to translate from
 *                      pszLatin1.  The translation will stop when reaching
 *                      cchLatin1 or the terminator ('\\0').  Use RTSTR_MAX
 *                      to translate the entire string.
 * @param   ppsz        If @a cch is non-zero, this must either be pointing
 *                      to a pointer to a buffer of the specified size, or
 *                      pointer to a NULL pointer.  If *ppsz is NULL or
 *                      @a cch is zero a buffer of at least @a cch chars
 *                      will be allocated to hold the translated string.  If
 *                      a buffer was requested it must be freed using
 *                      RTStrFree().
 * @param   cch         The buffer size in chars (the type).  This includes
 *                      the terminator.
 * @param   pcch        Where to store the length of the translated string,
 *                      excluding the terminator.  (Optional)
 *
 *                      This may be set under some error conditions,
 *                      however, only for VERR_BUFFER_OVERFLOW and
 *                      VERR_NO_STR_MEMORY will it contain a valid string
 *                      length that can be used to resize the buffer.
 */
#define RTLatin1ToUtf8Ex(pszLatin1, cchLatin1, ppsz, cch, pcch) \
RTLatin1ToUtf8ExTag((pszLatin1), (cchLatin1), (ppsz), (cch), (pcch), RTSTR_TAG)
/**
 * Translates Latin-1 to UTF-8 using buffer provided by the caller or a fittingly
 * sized buffer allocated by the function (custom tag).
 *
 * @returns iprt status code.
 * @param   pszLatin1   The Latin-1 string to convert.
 * @param   cchLatin1   The number of Latin-1 characters to translate from
 *                      pszLatin1.  The translation will stop when
 *                      reaching cchLatin1 or the terminator ('\\0').  Use
 *                      RTSTR_MAX to translate the entire string.
 * @param   ppsz        If @a cch is non-zero, this must either be pointing to
 *                      a pointer to a buffer of the specified size, or
 *                      pointer to a NULL pointer.  If *ppsz is NULL or @a cch
 *                      is zero a buffer of at least @a cch chars will be
 *                      allocated to hold the translated string.  If a
 *                      buffer was requested it must be freed using
 *                      RTStrFree().
 * @param   cch         The buffer size in chars (the type).  This includes
 *                      the terminator.
 * @param   pcch        Where to store the length of the translated string,
 *                      excluding the terminator.  (Optional)
 *
 *                      This may be set under some error conditions,
 *                      however, only for VERR_BUFFER_OVERFLOW and
 *                      VERR_NO_STR_MEMORY will it contain a valid string
 *                      length that can be used to resize the buffer.
 * @param   pszTag      Allocation tag used for statistics and such.
 */
RTDECL(int) RTLatin1ToUtf8ExTag(const char *pszLatin1, size_t cchLatin1, char **ppsz, size_t cch, size_t *pcch,
const char *pszTag);
/**
 * Calculates the length of the Latin-1 string in UTF-8 chars (bytes).
 *
 * The primary purpose of this function is to help allocate buffers of the
 * correct size for RTLatin1ToUtf8().  For most other purposes
 * RTLatin1ToUtf8Ex() should be used.
 *
 * @returns Number of chars (bytes).
 * @returns 0 if the string was incorrectly encoded.
 * @param   pszLatin1   The Latin-1 string.
 * @sa      RTLatin1CalcUtf8LenEx
 */
RTDECL(size_t) RTLatin1CalcUtf8Len(const char *pszLatin1);
/**
 * Calculates the length of the Latin-1 string in UTF-8 chars (bytes), with an
 * upper limit on the number of input chars examined.
 *
 * @returns iprt status code.
 * @param   pszLatin1   The Latin-1 string.
 * @param   cchLatin1   The max string length.  Use RTSTR_MAX to process the
 *                      entire string.
 * @param   pcch        Where to store the string length (in bytes).  Optional.
 *                      This is undefined on failure.
 * @sa      RTLatin1CalcUtf8Len
 */
RTDECL(int) RTLatin1CalcUtf8LenEx(const char *pszLatin1, size_t cchLatin1, size_t *pcch);
/**
 * Calculates the length of the Latin-1 (ISO-8859-1) string in RTUTF16 items.
 *
 * @returns Number of RTUTF16 items.
 * @param   pszLatin1   The Latin-1 string.
 * @sa      RTLatin1CalcUtf16LenEx
 */
RTDECL(size_t) RTLatin1CalcUtf16Len(const char *pszLatin1);
/**
 * Calculates the length of the Latin-1 (ISO-8859-1) string in RTUTF16 items,
 * with an upper limit on the number of input chars examined.
 *
 * @returns iprt status code.
 * @param   pszLatin1   The Latin-1 string.
 * @param   cchLatin1   The max string length.  Use RTSTR_MAX to process the
 *                      entire string.
 * @param   pcwc        Where to store the string length (in RTUTF16 items).
 *                      Optional.  This is undefined on failure.
 * @sa      RTLatin1CalcUtf16Len
 */
RTDECL(int) RTLatin1CalcUtf16LenEx(const char *pszLatin1, size_t cchLatin1, size_t *pcwc);
/**
 * Translate a Latin-1 (ISO-8859-1) string into UTF-16, allocating the result
 * buffer (default tag).
 *
 * Expands to a RTLatin1ToUtf16Tag() call with RTSTR_TAG as the allocation tag.
 *
 * @returns iprt status code.
 * @param   pszLatin1   The Latin-1 string to convert.
 * @param   ppwszString Receives pointer to the allocated UTF-16 string.  The
 *                      returned string must be freed using RTUtf16Free().
 */
#define RTLatin1ToUtf16(pszLatin1, ppwszString) RTLatin1ToUtf16Tag((pszLatin1), (ppwszString), RTSTR_TAG)
/**
 * Translate a Latin-1 (ISO-8859-1) string into UTF-16, allocating the result
 * buffer (custom tag).
 *
 * @returns iprt status code.
 * @param   pszLatin1   The Latin-1 string to convert.
 * @param   ppwszString Receives pointer to the allocated UTF-16 string.  The
 *                      returned string must be freed using RTUtf16Free().
 * @param   pszTag      Allocation tag used for statistics and such.
 */
RTDECL(int) RTLatin1ToUtf16Tag(const char *pszLatin1, PRTUTF16 *ppwszString, const char *pszTag);
/**
 * Translates pszLatin1 from Latin-1 (ISO-8859-1) to UTF-16, allocating the
 * result buffer if requested (default tag).
 *
 * Expands to a RTLatin1ToUtf16ExTag() call with RTSTR_TAG as the allocation
 * tag.
 *
 * @returns iprt status code.
 * @param   pszLatin1   The Latin-1 string to convert.
 * @param   cchLatin1   The maximum size in chars (the type) to convert.  The
 *                      conversion stops when it reaches cchLatin1 or the
 *                      string terminator ('\\0').  Use RTSTR_MAX to
 *                      translate the entire string.
 * @param   ppwsz       If @a cwc is non-zero, this must either be pointing
 *                      to a pointer to a buffer of the specified size, or
 *                      pointer to a NULL pointer.
 *                      If *ppwsz is NULL or @a cwc is zero a buffer of at
 *                      least @a cwc items will be allocated to hold the
 *                      translated string.  If a buffer was requested it
 *                      must be freed using RTUtf16Free().
 * @param   cwc         The buffer size in RTUTF16s.  This includes the
 *                      terminator.
 * @param   pcwc        Where to store the length of the translated string,
 *                      excluding the terminator.  (Optional)
 *
 *                      This may be set under some error conditions,
 *                      however, only for VERR_BUFFER_OVERFLOW and
 *                      VERR_NO_STR_MEMORY will it contain a valid string
 *                      length that can be used to resize the buffer.
 */
#define RTLatin1ToUtf16Ex(pszLatin1, cchLatin1, ppwsz, cwc, pcwc) \
RTLatin1ToUtf16ExTag((pszLatin1), (cchLatin1), (ppwsz), (cwc), (pcwc), RTSTR_TAG)
/**
 * Translates pszLatin1 from Latin-1 (ISO-8859-1) to UTF-16, allocating the
 * result buffer if requested (custom tag).
 *
 * @returns iprt status code.
 * @param   pszLatin1   The Latin-1 string to convert.
 * @param   cchLatin1   The maximum size in chars (the type) to convert.  The
 *                      conversion stops when it reaches cchLatin1 or the
 *                      string terminator ('\\0').  Use RTSTR_MAX to
 *                      translate the entire string.
 * @param   ppwsz       If @a cwc is non-zero, this must either be pointing
 *                      to a pointer to a buffer of the specified size, or
 *                      pointer to a NULL pointer.
 *                      If *ppwsz is NULL or @a cwc is zero a buffer of at
 *                      least @a cwc items will be allocated to hold the
 *                      translated string.  If a buffer was requested it
 *                      must be freed using RTUtf16Free().
 * @param   cwc         The buffer size in RTUTF16s.  This includes the
 *                      terminator.
 * @param   pcwc        Where to store the length of the translated string,
 *                      excluding the terminator.  (Optional)
 *
 *                      This may be set under some error conditions,
 *                      however, only for VERR_BUFFER_OVERFLOW and
 *                      VERR_NO_STR_MEMORY will it contain a valid string
 *                      length that can be used to resize the buffer.
 * @param   pszTag      Allocation tag used for statistics and such.
 */
RTDECL(int) RTLatin1ToUtf16ExTag(const char *pszLatin1, size_t cchLatin1,
PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag);
/** @} */
RT_C_DECLS_END
/** @} */
#endif