Mã hóa / Giải mã URL trong C ++ [đóng]

Question 1

Có ai biết về bất kỳ mã C ++ tốt nào làm điều này không?

Question 2

Tôi đã đối mặt với một nửa mã hóa của vấn đề này vào ngày hôm trước. Không hài lòng với các tùy chọn có sẵn và sau khi xem mã mẫu C này , tôi đã quyết định triển khai chức năng mã hóa url C ++ của riêng mình:

#include <cctype>
#include <iomanip>
#include <sstream>
#include <string>

using namespace std;

string url_encode(const string &value) {
    ostringstream escaped;
    escaped.fill('0');
    escaped << hex;

    for (string::const_iterator i = value.begin(), n = value.end(); i != n; ++i) {
        string::value_type c = (*i);

        // Keep alphanumeric and other accepted characters intact
        if (isalnum(c) || c == '-' || c == '_' || c == '.' || c == '~') {
            escaped << c;
            continue;
        }

        // Any other characters are percent-encoded
        escaped << uppercase;
        escaped << '%' << setw(2) << int((unsigned char) c);
        escaped << nouppercase;
    }

    return escaped.str();
}

Việc thực hiện các chức năng giải mã được để lại như một bài tập cho người đọc. : P

Question 3

Trả lời câu hỏi của riêng tôi ...

libcurl có curl_easy_escape để mã hóa.

Để giải mã, curl_easy_unescape

Question 4

string urlDecode(string &SRC) {
    string ret;
    char ch;
    int i, ii;
    for (i=0; i<SRC.length(); i++) {
        if (int(SRC[i])==37) {
            sscanf(SRC.substr(i+1,2).c_str(), "%x", &ii);
            ch=static_cast<char>(ii);
            ret+=ch;
            i=i+2;
        } else {
            ret+=SRC[i];
        }
    }
    return (ret);
}

không phải là tốt nhất, nhưng hoạt động tốt ;-)

Question 5

cpp-netlib có các chức năng

namespace boost {
  namespace network {
    namespace uri {    
      inline std::string decoded(const std::string &input);
      inline std::string encoded(const std::string &input);
    }
  }
}

chúng cho phép mã hóa và giải mã chuỗi URL rất dễ dàng.

Question 6

Thông thường thêm '%' vào giá trị int của một ký tự sẽ không hoạt động khi mã hóa, giá trị được cho là tương đương hex. ví dụ: '/' là '% 2F' không phải '% 47'.

Tôi nghĩ đây là giải pháp tốt nhất và ngắn gọn cho cả mã hóa và giải mã url (Không có nhiều phụ thuộc tiêu đề).

string urlEncode(string str){
    string new_str = "";
    char c;
    int ic;
    const char* chars = str.c_str();
    char bufHex[10];
    int len = strlen(chars);

    for(int i=0;i<len;i++){
        c = chars[i];
        ic = c;
        // uncomment this if you want to encode spaces with +
        /*if (c==' ') new_str += '+';   
        else */if (isalnum(c) || c == '-' || c == '_' || c == '.' || c == '~') new_str += c;
        else {
            sprintf(bufHex,"%X",c);
            if(ic < 16) 
                new_str += "%0"; 
            else
                new_str += "%";
            new_str += bufHex;
        }
    }
    return new_str;
 }

string urlDecode(string str){
    string ret;
    char ch;
    int i, ii, len = str.length();

    for (i=0; i < len; i++){
        if(str[i] != '%'){
            if(str[i] == '+')
                ret += ' ';
            else
                ret += str[i];
        }else{
            sscanf(str.substr(i + 1, 2).c_str(), "%x", &ii);
            ch = static_cast<char>(ii);
            ret += ch;
            i = i + 2;
        }
    }
    return ret;
}

Question 7

[Bật chế độ Necromancer]
Tình cờ gặp câu hỏi này khi đang tìm kiếm giải pháp nhanh, hiện đại, độc lập với nền tảng và thanh lịch. Không giống như bất kỳ điều nào ở trên, cpp-netlib sẽ là người chiến thắng nhưng nó có lỗ hổng bộ nhớ khủng khiếp trong chức năng "giải mã". Vì vậy, tôi đã nghĩ ra giải pháp khí lực / nghiệp lực của boost.

namespace bsq = boost::spirit::qi;
namespace bk = boost::spirit::karma;
bsq::int_parser<unsigned char, 16, 2, 2> hex_byte;
template <typename InputIterator>
struct unescaped_string
    : bsq::grammar<InputIterator, std::string(char const *)> {
  unescaped_string() : unescaped_string::base_type(unesc_str) {
    unesc_char.add("+", ' ');

    unesc_str = *(unesc_char | "%" >> hex_byte | bsq::char_);
  }

  bsq::rule<InputIterator, std::string(char const *)> unesc_str;
  bsq::symbols<char const, char const> unesc_char;
};

template <typename OutputIterator>
struct escaped_string : bk::grammar<OutputIterator, std::string(char const *)> {
  escaped_string() : escaped_string::base_type(esc_str) {

    esc_str = *(bk::char_("a-zA-Z0-9_.~-") | "%" << bk::right_align(2,0)[bk::hex]);
  }
  bk::rule<OutputIterator, std::string(char const *)> esc_str;
};

Cách sử dụng ở trên như sau:

std::string unescape(const std::string &input) {
  std::string retVal;
  retVal.reserve(input.size());
  typedef std::string::const_iterator iterator_type;

  char const *start = "";
  iterator_type beg = input.begin();
  iterator_type end = input.end();
  unescaped_string<iterator_type> p;

  if (!bsq::parse(beg, end, p(start), retVal))
    retVal = input;
  return retVal;
}

std::string escape(const std::string &input) {
  typedef std::back_insert_iterator<std::string> sink_type;
  std::string retVal;
  retVal.reserve(input.size() * 3);
  sink_type sink(retVal);
  char const *start = "";

  escaped_string<sink_type> g;
  if (!bk::generate(sink, g(start), input))
    retVal = input;
  return retVal;
}

[Chế độ Necromancer đang tắt]

EDIT01: sửa lỗi không đệm - đặc biệt cảm ơn Hartmut Kaiser
EDIT02: Trực tiếp trên CoLiRu

Question 8

CGICC bao gồm các phương pháp để mã hóa và giải mã url. form_urlencode và form_urldecode

Question 9

Lấy cảm hứng từ xperroni, tôi đã viết một bộ giải mã. Cảm ơn bạn cho con trỏ.

#include <iostream>
#include <sstream>
#include <string>

using namespace std;

char from_hex(char ch) {
    return isdigit(ch) ? ch - '0' : tolower(ch) - 'a' + 10;
}

string url_decode(string text) {
    char h;
    ostringstream escaped;
    escaped.fill('0');

    for (auto i = text.begin(), n = text.end(); i != n; ++i) {
        string::value_type c = (*i);

        if (c == '%') {
            if (i[1] && i[2]) {
                h = from_hex(i[1]) << 4 | from_hex(i[2]);
                escaped << h;
                i += 2;
            }
        } else if (c == '+') {
            escaped << ' ';
        } else {
            escaped << c;
        }
    }

    return escaped.str();
}

int main(int argc, char** argv) {
    string msg = "J%C3%B8rn!";
    cout << msg << endl;
    string decodemsg = url_decode(msg);
    cout << decodemsg << endl;

    return 0;
}

chỉnh sửa: Đã xóa cctype không cần thiết và bao gồm iomainip.

Question 10

Thêm thông tin tiếp theo vào khuyến nghị của Bill về việc sử dụng libcurl: đề xuất tuyệt vời và sẽ được cập nhật:
sau 3 năm, hàm curl_escape không còn được dùng nữa, vì vậy, để sử dụng trong tương lai, tốt hơn nên sử dụng curl_easy_escape .

Question 11

Tôi đã kết thúc câu hỏi này khi tìm kiếm một api để giải mã url trong ứng dụng win32 c ++. Vì câu hỏi không xác định rõ nền tảng nên giả sử cửa sổ không phải là một điều xấu.

InternetCanonicalizeUrl là API cho các chương trình windows. Thông tin thêm tại đây

        LPTSTR lpOutputBuffer = new TCHAR[1];
        DWORD dwSize = 1;
        BOOL fRes = ::InternetCanonicalizeUrl(strUrl, lpOutputBuffer, &dwSize, ICU_DECODE | ICU_NO_ENCODE);
        DWORD dwError = ::GetLastError();
        if (!fRes && dwError == ERROR_INSUFFICIENT_BUFFER)
        {
            delete lpOutputBuffer;
            lpOutputBuffer = new TCHAR[dwSize];
            fRes = ::InternetCanonicalizeUrl(strUrl, lpOutputBuffer, &dwSize, ICU_DECODE | ICU_NO_ENCODE);
            if (fRes)
            {
                //lpOutputBuffer has decoded url
            }
            else
            {
                //failed to decode
            }
            if (lpOutputBuffer !=NULL)
            {
                delete [] lpOutputBuffer;
                lpOutputBuffer = NULL;
            }
        }
        else
        {
            //some other error OR the input string url is just 1 char and was successfully decoded
        }

InternetCrackUrl ( ở đây ) dường như cũng có cờ để chỉ định xem có giải mã url hay không

Question 12

Tôi không thể tìm thấy giải mã URI / unescape ở đây cũng giải mã chuỗi 2 và 3 byte. Đóng góp phiên bản hiệu suất cao của riêng tôi, chuyển đổi trực tiếp đầu vào c sting thành một chuỗi:

#include <string>

const char HEX2DEC[55] =
{
     0, 1, 2, 3,  4, 5, 6, 7,  8, 9,-1,-1, -1,-1,-1,-1,
    -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,10,11,12, 13,14,15
};

#define __x2d__(s) HEX2DEC[*(s)-48]
#define __x2d2__(s) __x2d__(s) << 4 | __x2d__(s+1)

std::wstring decodeURI(const char * s) {
    unsigned char b;
    std::wstring ws;
    while (*s) {
        if (*s == '%')
            if ((b = __x2d2__(s + 1)) >= 0x80) {
                if (b >= 0xE0) { // three byte codepoint
                    ws += ((b & 0b00001111) << 12) | ((__x2d2__(s + 4) & 0b00111111) << 6) | (__x2d2__(s + 7) & 0b00111111);
                    s += 9;
                }
                else { // two byte codepoint
                    ws += (__x2d2__(s + 4) & 0b00111111) | (b & 0b00000011) << 6;
                    s += 6;
                }
            }
            else { // one byte codepoints
                ws += b;
                s += 3;
            }
        else { // no %
            ws += *s;
            s++;
        }
    }
    return ws;
}

Question 13

API Windows có các chức năng UrlEscape / UrlUnescape , được xuất bởi shlwapi.dll, cho tác vụ này.

Question 14

Phiên bản này là thuần C và có thể tùy chọn chuẩn hóa đường dẫn tài nguyên. Sử dụng nó với C ++ là không bình thường:

#include <string>
#include <iostream>

int main(int argc, char** argv)
{
    const std::string src("/some.url/foo/../bar/%2e/");
    std::cout << "src=\"" << src << "\"" << std::endl;

    // either do it the C++ conformant way:
    char* dst_buf = new char[src.size() + 1];
    urldecode(dst_buf, src.c_str(), 1);
    std::string dst1(dst_buf);
    delete[] dst_buf;
    std::cout << "dst1=\"" << dst1 << "\"" << std::endl;

    // or in-place with the &[0] trick to skip the new/delete
    std::string dst2;
    dst2.resize(src.size() + 1);
    dst2.resize(urldecode(&dst2[0], src.c_str(), 1));
    std::cout << "dst2=\"" << dst2 << "\"" << std::endl;
}

Kết quả đầu ra:

src="/some.url/foo/../bar/%2e/"
dst1="/some.url/bar/"
dst2="/some.url/bar/"

Và chức năng thực tế:

#include <stddef.h>
#include <ctype.h>

/**
 * decode a percent-encoded C string with optional path normalization
 *
 * The buffer pointed to by @dst must be at least strlen(@src) bytes.
 * Decoding stops at the first character from @src that decodes to null.
 * Path normalization will remove redundant slashes and slash+dot sequences,
 * as well as removing path components when slash+dot+dot is found. It will
 * keep the root slash (if one was present) and will stop normalization
 * at the first questionmark found (so query parameters won't be normalized).
 *
 * @param dst       destination buffer
 * @param src       source buffer
 * @param normalize perform path normalization if nonzero
 * @return          number of valid characters in @dst
 * @author          Johan Lindh <johan@linkdata.se>
 * @legalese        BSD licensed (http://opensource.org/licenses/BSD-2-Clause)
 */
ptrdiff_t urldecode(char* dst, const char* src, int normalize)
{
    char* org_dst = dst;
    int slash_dot_dot = 0;
    char ch, a, b;
    do {
        ch = *src++;
        if (ch == '%' && isxdigit(a = src[0]) && isxdigit(b = src[1])) {
            if (a < 'A') a -= '0';
            else if(a < 'a') a -= 'A' - 10;
            else a -= 'a' - 10;
            if (b < 'A') b -= '0';
            else if(b < 'a') b -= 'A' - 10;
            else b -= 'a' - 10;
            ch = 16 * a + b;
            src += 2;
        }
        if (normalize) {
            switch (ch) {
            case '/':
                if (slash_dot_dot < 3) {
                    /* compress consecutive slashes and remove slash-dot */
                    dst -= slash_dot_dot;
                    slash_dot_dot = 1;
                    break;
                }
                /* fall-through */
            case '?':
                /* at start of query, stop normalizing */
                if (ch == '?')
                    normalize = 0;
                /* fall-through */
            case '\0':
                if (slash_dot_dot > 1) {
                    /* remove trailing slash-dot-(dot) */
                    dst -= slash_dot_dot;
                    /* remove parent directory if it was two dots */
                    if (slash_dot_dot == 3)
                        while (dst > org_dst && *--dst != '/')
                            /* empty body */;
                    slash_dot_dot = (ch == '/') ? 1 : 0;
                    /* keep the root slash if any */
                    if (!slash_dot_dot && dst == org_dst && *dst == '/')
                        ++dst;
                }
                break;
            case '.':
                if (slash_dot_dot == 1 || slash_dot_dot == 2) {
                    ++slash_dot_dot;
                    break;
                }
                /* fall-through */
            default:
                slash_dot_dot = 0;
            }
        }
        *dst++ = ch;
    } while(ch);
    return (dst - org_dst) - 1;
}

Question 15

các bit ngon ngọt

#include <ctype.h> // isdigit, tolower

from_hex(char ch) {
  return isdigit(ch) ? ch - '0' : tolower(ch) - 'a' + 10;
}

char to_hex(char code) {
  static char hex[] = "0123456789abcdef";
  return hex[code & 15];
}

ghi chú điều đó

char d = from_hex(hex[0]) << 4 | from_hex(hex[1]);

như trong

// %7B = '{'

char d = from_hex('7') << 4 | from_hex('B');

Question 16

Bạn có thể sử dụng hàm "g_uri_escape_string ()" được cung cấp glib.h. https://developer.gnome.org/glib/stable/glib-URI-Functions.html

#include <stdio.h>
#include <stdlib.h>
#include <glib.h>
int main() {
    char *uri = "http://www.example.com?hello world";
    char *encoded_uri = NULL;
    //as per wiki (https://en.wikipedia.org/wiki/Percent-encoding)
    char *escape_char_str = "!*'();:@&=+$,/?#[]"; 
    encoded_uri = g_uri_escape_string(uri, escape_char_str, TRUE);
    printf("[%s]\n", encoded_uri);
    free(encoded_uri);

    return 0;
}

biên dịch nó với:

gcc encoding_URI.c `pkg-config --cflags --libs glib-2.0`

Question 17

Một giải pháp khác có sẵn bằng cách sử dụng thư viện điên rồ của Facebook : folly::uriEscapevà folly::uriUnescape.

Question 18

Tôi biết câu hỏi yêu cầu một phương thức C ++, nhưng đối với những người có thể cần nó, tôi đã nghĩ ra một hàm rất ngắn trong C đơn giản để mã hóa một chuỗi. Nó không tạo một chuỗi mới, thay vào đó nó thay đổi chuỗi hiện có, nghĩa là nó phải có đủ kích thước để chứa chuỗi mới. Rất dễ dàng để theo kịp.

void urlEncode(char *string)
{
    char charToEncode;
    int posToEncode;
    while (((posToEncode=strspn(string,"1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_.~"))!=0) &&(posToEncode<strlen(string)))
    {
        charToEncode=string[posToEncode];
        memmove(string+posToEncode+3,string+posToEncode+1,strlen(string+posToEncode));
        string[posToEncode]='%';
        string[posToEncode+1]="0123456789ABCDEF"[charToEncode>>4];
        string[posToEncode+2]="0123456789ABCDEF"[charToEncode&0xf];
        string+=posToEncode+3;
    }
}

Question 19

bạn có thể chỉ cần sử dụng hàm AtlEscapeUrl () từ atlutil.h, chỉ cần xem qua tài liệu về cách sử dụng nó.

Question 20

Phải làm điều đó trong một dự án mà không có Boost. Vì vậy, cuối cùng đã viết của riêng tôi. Tôi sẽ chỉ đưa nó lên GitHub: https://github.com/corporatehark/LUrlParser

clParseURL URL = clParseURL::ParseURL( "https://name:pwd@github.com:80/path/res" );

if ( URL.IsValid() )
{
    cout << "Scheme    : " << URL.m_Scheme << endl;
    cout << "Host      : " << URL.m_Host << endl;
    cout << "Port      : " << URL.m_Port << endl;
    cout << "Path      : " << URL.m_Path << endl;
    cout << "Query     : " << URL.m_Query << endl;
    cout << "Fragment  : " << URL.m_Fragment << endl;
    cout << "User name : " << URL.m_UserName << endl;
    cout << "Password  : " << URL.m_Password << endl;
}