| /* |
| pathencode.c - efficient path name encoding |
| |
| Copyright 2012 Facebook |
| |
| This software may be used and distributed according to the terms of |
| the GNU General Public License, incorporated herein by reference. |
| */ |
| |
| /* |
| * An implementation of the name encoding scheme used by the fncache |
| * store. The common case is of a path < 120 bytes long, which is |
| * handled either in a single pass with no allocations or two passes |
| * with a single allocation. For longer paths, multiple passes are |
| * required. |
| */ |
| |
| #define PY_SSIZE_T_CLEAN |
| #include <Python.h> |
| #include <assert.h> |
| #include <ctype.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #include "util.h" |
| |
| /* state machine for the fast path */ |
| enum path_state { |
| START, /* first byte of a path component */ |
| A, /* "AUX" */ |
| AU, |
| THIRD, /* third of a 3-byte sequence, e.g. "AUX", "NUL" */ |
| C, /* "CON" or "COMn" */ |
| CO, |
| COMLPT, /* "COM" or "LPT" */ |
| COMLPTn, |
| L, |
| LP, |
| N, |
| NU, |
| P, /* "PRN" */ |
| PR, |
| LDOT, /* leading '.' */ |
| DOT, /* '.' in a non-leading position */ |
| H, /* ".h" */ |
| HGDI, /* ".hg", ".d", or ".i" */ |
| SPACE, |
| DEFAULT /* byte of a path component after the first */ |
| }; |
| |
| /* state machine for dir-encoding */ |
| enum dir_state { |
| DDOT, |
| DH, |
| DHGDI, |
| DDEFAULT |
| }; |
| |
| static inline int inset(const uint32_t bitset[], char c) |
| { |
| return bitset[((uint8_t)c) >> 5] & (1 << (((uint8_t)c) & 31)); |
| } |
| |
| static inline void charcopy(char *dest, Py_ssize_t *destlen, size_t destsize, |
| char c) |
| { |
| if (dest) { |
| assert(*destlen < destsize); |
| dest[*destlen] = c; |
| } |
| (*destlen)++; |
| } |
| |
| static inline void memcopy(char *dest, Py_ssize_t *destlen, size_t destsize, |
| const void *src, Py_ssize_t len) |
| { |
| if (dest) { |
| assert(*destlen + len < destsize); |
| memcpy((void *)&dest[*destlen], src, len); |
| } |
| *destlen += len; |
| } |
| |
| static inline void hexencode(char *dest, Py_ssize_t *destlen, size_t destsize, |
| uint8_t c) |
| { |
| static const char hexdigit[] = "0123456789abcdef"; |
| |
| charcopy(dest, destlen, destsize, hexdigit[c >> 4]); |
| charcopy(dest, destlen, destsize, hexdigit[c & 15]); |
| } |
| |
| /* 3-byte escape: tilde followed by two hex digits */ |
| static inline void escape3(char *dest, Py_ssize_t *destlen, size_t destsize, |
| char c) |
| { |
| charcopy(dest, destlen, destsize, '~'); |
| hexencode(dest, destlen, destsize, c); |
| } |
| |
| static Py_ssize_t _encodedir(char *dest, size_t destsize, |
| const char *src, Py_ssize_t len) |
| { |
| enum dir_state state = DDEFAULT; |
| Py_ssize_t i = 0, destlen = 0; |
| |
| while (i < len) { |
| switch (state) { |
| case DDOT: |
| switch (src[i]) { |
| case 'd': |
| case 'i': |
| state = DHGDI; |
| charcopy(dest, &destlen, destsize, src[i++]); |
| break; |
| case 'h': |
| state = DH; |
| charcopy(dest, &destlen, destsize, src[i++]); |
| break; |
| default: |
| state = DDEFAULT; |
| break; |
| } |
| break; |
| case DH: |
| if (src[i] == 'g') { |
| state = DHGDI; |
| charcopy(dest, &destlen, destsize, src[i++]); |
| } |
| else state = DDEFAULT; |
| break; |
| case DHGDI: |
| if (src[i] == '/') { |
| memcopy(dest, &destlen, destsize, ".hg", 3); |
| charcopy(dest, &destlen, destsize, src[i++]); |
| } |
| state = DDEFAULT; |
| break; |
| case DDEFAULT: |
| if (src[i] == '.') |
| state = DDOT; |
| charcopy(dest, &destlen, destsize, src[i++]); |
| break; |
| } |
| } |
| |
| return destlen; |
| } |
| |
| PyObject *encodedir(PyObject *self, PyObject *args) |
| { |
| Py_ssize_t len, newlen; |
| PyObject *pathobj, *newobj; |
| char *path; |
| |
| if (!PyArg_ParseTuple(args, "O:encodedir", &pathobj)) |
| return NULL; |
| |
| if (PyString_AsStringAndSize(pathobj, &path, &len) == -1) { |
| PyErr_SetString(PyExc_TypeError, "expected a string"); |
| return NULL; |
| } |
| |
| newlen = len ? _encodedir(NULL, 0, path, len + 1) : 1; |
| |
| if (newlen == len + 1) { |
| Py_INCREF(pathobj); |
| return pathobj; |
| } |
| |
| newobj = PyString_FromStringAndSize(NULL, newlen); |
| |
| if (newobj) { |
| PyString_GET_SIZE(newobj)--; |
| _encodedir(PyString_AS_STRING(newobj), newlen, path, |
| len + 1); |
| } |
| |
| return newobj; |
| } |
| |
| static Py_ssize_t _encode(const uint32_t twobytes[8], const uint32_t onebyte[8], |
| char *dest, Py_ssize_t destlen, size_t destsize, |
| const char *src, Py_ssize_t len, |
| int encodedir) |
| { |
| enum path_state state = START; |
| Py_ssize_t i = 0; |
| |
| /* |
| * Python strings end with a zero byte, which we use as a |
| * terminal token as they are not valid inside path names. |
| */ |
| |
| while (i < len) { |
| switch (state) { |
| case START: |
| switch (src[i]) { |
| case '/': |
| charcopy(dest, &destlen, destsize, src[i++]); |
| break; |
| case '.': |
| state = LDOT; |
| escape3(dest, &destlen, destsize, src[i++]); |
| break; |
| case ' ': |
| state = DEFAULT; |
| escape3(dest, &destlen, destsize, src[i++]); |
| break; |
| case 'a': |
| state = A; |
| charcopy(dest, &destlen, destsize, src[i++]); |
| break; |
| case 'c': |
| state = C; |
| charcopy(dest, &destlen, destsize, src[i++]); |
| break; |
| case 'l': |
| state = L; |
| charcopy(dest, &destlen, destsize, src[i++]); |
| break; |
| case 'n': |
| state = N; |
| charcopy(dest, &destlen, destsize, src[i++]); |
| break; |
| case 'p': |
| state = P; |
| charcopy(dest, &destlen, destsize, src[i++]); |
| break; |
| default: |
| state = DEFAULT; |
| break; |
| } |
| break; |
| case A: |
| if (src[i] == 'u') { |
| state = AU; |
| charcopy(dest, &destlen, destsize, src[i++]); |
| } |
| else state = DEFAULT; |
| break; |
| case AU: |
| if (src[i] == 'x') { |
| state = THIRD; |
| i++; |
| } |
| else state = DEFAULT; |
| break; |
| case THIRD: |
| state = DEFAULT; |
| switch (src[i]) { |
| case '.': |
| case '/': |
| case '\0': |
| escape3(dest, &destlen, destsize, src[i - 1]); |
| break; |
| default: |
| i--; |
| break; |
| } |
| break; |
| case C: |
| if (src[i] == 'o') { |
| state = CO; |
| charcopy(dest, &destlen, destsize, src[i++]); |
| } |
| else state = DEFAULT; |
| break; |
| case CO: |
| if (src[i] == 'm') { |
| state = COMLPT; |
| i++; |
| } |
| else if (src[i] == 'n') { |
| state = THIRD; |
| i++; |
| } |
| else state = DEFAULT; |
| break; |
| case COMLPT: |
| switch (src[i]) { |
| case '1': case '2': case '3': case '4': case '5': |
| case '6': case '7': case '8': case '9': |
| state = COMLPTn; |
| i++; |
| break; |
| default: |
| state = DEFAULT; |
| charcopy(dest, &destlen, destsize, src[i - 1]); |
| break; |
| } |
| break; |
| case COMLPTn: |
| state = DEFAULT; |
| switch (src[i]) { |
| case '.': |
| case '/': |
| case '\0': |
| escape3(dest, &destlen, destsize, src[i - 2]); |
| charcopy(dest, &destlen, destsize, src[i - 1]); |
| break; |
| default: |
| memcopy(dest, &destlen, destsize, |
| &src[i - 2], 2); |
| break; |
| } |
| break; |
| case L: |
| if (src[i] == 'p') { |
| state = LP; |
| charcopy(dest, &destlen, destsize, src[i++]); |
| } |
| else state = DEFAULT; |
| break; |
| case LP: |
| if (src[i] == 't') { |
| state = COMLPT; |
| i++; |
| } |
| else state = DEFAULT; |
| break; |
| case N: |
| if (src[i] == 'u') { |
| state = NU; |
| charcopy(dest, &destlen, destsize, src[i++]); |
| } |
| else state = DEFAULT; |
| break; |
| case NU: |
| if (src[i] == 'l') { |
| state = THIRD; |
| i++; |
| } |
| else state = DEFAULT; |
| break; |
| case P: |
| if (src[i] == 'r') { |
| state = PR; |
| charcopy(dest, &destlen, destsize, src[i++]); |
| } |
| else state = DEFAULT; |
| break; |
| case PR: |
| if (src[i] == 'n') { |
| state = THIRD; |
| i++; |
| } |
| else state = DEFAULT; |
| break; |
| case LDOT: |
| switch (src[i]) { |
| case 'd': |
| case 'i': |
| state = HGDI; |
| charcopy(dest, &destlen, destsize, src[i++]); |
| break; |
| case 'h': |
| state = H; |
| charcopy(dest, &destlen, destsize, src[i++]); |
| break; |
| default: |
| state = DEFAULT; |
| break; |
| } |
| break; |
| case DOT: |
| switch (src[i]) { |
| case '/': |
| case '\0': |
| state = START; |
| memcopy(dest, &destlen, destsize, "~2e", 3); |
| charcopy(dest, &destlen, destsize, src[i++]); |
| break; |
| case 'd': |
| case 'i': |
| state = HGDI; |
| charcopy(dest, &destlen, destsize, '.'); |
| charcopy(dest, &destlen, destsize, src[i++]); |
| break; |
| case 'h': |
| state = H; |
| memcopy(dest, &destlen, destsize, ".h", 2); |
| i++; |
| break; |
| default: |
| state = DEFAULT; |
| charcopy(dest, &destlen, destsize, '.'); |
| break; |
| } |
| break; |
| case H: |
| if (src[i] == 'g') { |
| state = HGDI; |
| charcopy(dest, &destlen, destsize, src[i++]); |
| } |
| else state = DEFAULT; |
| break; |
| case HGDI: |
| if (src[i] == '/') { |
| state = START; |
| if (encodedir) |
| memcopy(dest, &destlen, destsize, ".hg", |
| 3); |
| charcopy(dest, &destlen, destsize, src[i++]); |
| } |
| else state = DEFAULT; |
| break; |
| case SPACE: |
| switch (src[i]) { |
| case '/': |
| case '\0': |
| state = START; |
| memcopy(dest, &destlen, destsize, "~20", 3); |
| charcopy(dest, &destlen, destsize, src[i++]); |
| break; |
| default: |
| state = DEFAULT; |
| charcopy(dest, &destlen, destsize, ' '); |
| break; |
| } |
| break; |
| case DEFAULT: |
| while (inset(onebyte, src[i])) { |
| charcopy(dest, &destlen, destsize, src[i++]); |
| if (i == len) |
| goto done; |
| } |
| switch (src[i]) { |
| case '.': |
| state = DOT; |
| i++; |
| break; |
| case ' ': |
| state = SPACE; |
| i++; |
| break; |
| case '/': |
| state = START; |
| charcopy(dest, &destlen, destsize, '/'); |
| i++; |
| break; |
| default: |
| if (inset(onebyte, src[i])) { |
| do { |
| charcopy(dest, &destlen, |
| destsize, src[i++]); |
| } while (i < len && |
| inset(onebyte, src[i])); |
| } |
| else if (inset(twobytes, src[i])) { |
| char c = src[i++]; |
| charcopy(dest, &destlen, destsize, '_'); |
| charcopy(dest, &destlen, destsize, |
| c == '_' ? '_' : c + 32); |
| } |
| else |
| escape3(dest, &destlen, destsize, |
| src[i++]); |
| break; |
| } |
| break; |
| } |
| } |
| done: |
| return destlen; |
| } |
| |
| static Py_ssize_t basicencode(char *dest, size_t destsize, |
| const char *src, Py_ssize_t len) |
| { |
| static const uint32_t twobytes[8] = { 0, 0, 0x87fffffe }; |
| |
| static const uint32_t onebyte[8] = { |
| 1, 0x2bff3bfa, 0x68000001, 0x2fffffff, |
| }; |
| |
| Py_ssize_t destlen = 0; |
| |
| return _encode(twobytes, onebyte, dest, destlen, destsize, |
| src, len, 1); |
| } |
| |
| static const Py_ssize_t maxstorepathlen = 120; |
| |
| static Py_ssize_t _lowerencode(char *dest, size_t destsize, |
| const char *src, Py_ssize_t len) |
| { |
| static const uint32_t onebyte[8] = { |
| 1, 0x2bfffbfb, 0xe8000001, 0x2fffffff |
| }; |
| |
| static const uint32_t lower[8] = { 0, 0, 0x7fffffe }; |
| |
| Py_ssize_t i, destlen = 0; |
| |
| for (i = 0; i < len; i++) { |
| if (inset(onebyte, src[i])) |
| charcopy(dest, &destlen, destsize, src[i]); |
| else if (inset(lower, src[i])) |
| charcopy(dest, &destlen, destsize, src[i] + 32); |
| else |
| escape3(dest, &destlen, destsize, src[i]); |
| } |
| |
| return destlen; |
| } |
| |
| PyObject *lowerencode(PyObject *self, PyObject *args) |
| { |
| char *path; |
| Py_ssize_t len, newlen; |
| PyObject *ret; |
| |
| if (!PyArg_ParseTuple(args, "s#:lowerencode", &path, &len)) |
| return NULL; |
| |
| newlen = _lowerencode(NULL, 0, path, len); |
| ret = PyString_FromStringAndSize(NULL, newlen); |
| if (ret) |
| newlen = _lowerencode(PyString_AS_STRING(ret), newlen, |
| path, len); |
| |
| return ret; |
| } |
| |
| /* See store.py:_auxencode for a description. */ |
| static Py_ssize_t auxencode(char *dest, size_t destsize, |
| const char *src, Py_ssize_t len) |
| { |
| static const uint32_t twobytes[8]; |
| |
| static const uint32_t onebyte[8] = { |
| ~0, 0xffff3ffe, ~0, ~0, ~0, ~0, ~0, ~0, |
| }; |
| |
| return _encode(twobytes, onebyte, dest, 0, destsize, src, len, 0); |
| } |
| |
| static PyObject *hashmangle(const char *src, Py_ssize_t len, const char sha[20]) |
| { |
| static const Py_ssize_t dirprefixlen = 8; |
| static const Py_ssize_t maxshortdirslen = 68; |
| char *dest; |
| PyObject *ret; |
| |
| Py_ssize_t i, d, p, lastslash = len - 1, lastdot = -1; |
| Py_ssize_t destsize, destlen = 0, slop, used; |
| |
| while (lastslash >= 0 && src[lastslash] != '/') { |
| if (src[lastslash] == '.' && lastdot == -1) |
| lastdot = lastslash; |
| lastslash--; |
| } |
| |
| #if 0 |
| /* All paths should end in a suffix of ".i" or ".d". |
| Unfortunately, the file names in test-hybridencode.py |
| violate this rule. */ |
| if (lastdot != len - 3) { |
| PyErr_SetString(PyExc_ValueError, |
| "suffix missing or wrong length"); |
| return NULL; |
| } |
| #endif |
| |
| /* If src contains a suffix, we will append it to the end of |
| the new string, so make room. */ |
| destsize = 120; |
| if (lastdot >= 0) |
| destsize += len - lastdot - 1; |
| |
| ret = PyString_FromStringAndSize(NULL, destsize); |
| if (ret == NULL) |
| return NULL; |
| |
| dest = PyString_AS_STRING(ret); |
| memcopy(dest, &destlen, destsize, "dh/", 3); |
| |
| /* Copy up to dirprefixlen bytes of each path component, up to |
| a limit of maxshortdirslen bytes. */ |
| for (i = d = p = 0; i < lastslash; i++, p++) { |
| if (src[i] == '/') { |
| char d = dest[destlen - 1]; |
| /* After truncation, a directory name may end |
| in a space or dot, which are unportable. */ |
| if (d == '.' || d == ' ') |
| dest[destlen - 1] = '_'; |
| if (destlen > maxshortdirslen) |
| break; |
| charcopy(dest, &destlen, destsize, src[i]); |
| p = -1; |
| } |
| else if (p < dirprefixlen) |
| charcopy(dest, &destlen, destsize, src[i]); |
| } |
| |
| /* Rewind to just before the last slash copied. */ |
| if (destlen > maxshortdirslen + 3) |
| do { |
| destlen--; |
| } while (destlen > 0 && dest[destlen] != '/'); |
| |
| if (destlen > 3) { |
| if (lastslash > 0) { |
| char d = dest[destlen - 1]; |
| /* The last directory component may be |
| truncated, so make it safe. */ |
| if (d == '.' || d == ' ') |
| dest[destlen - 1] = '_'; |
| } |
| |
| charcopy(dest, &destlen, destsize, '/'); |
| } |
| |
| /* Add a prefix of the original file's name. Its length |
| depends on the number of bytes left after accounting for |
| hash and suffix. */ |
| used = destlen + 40; |
| if (lastdot >= 0) |
| used += len - lastdot - 1; |
| slop = maxstorepathlen - used; |
| if (slop > 0) { |
| Py_ssize_t basenamelen = |
| lastslash >= 0 ? len - lastslash - 2 : len - 1; |
| |
| if (basenamelen > slop) |
| basenamelen = slop; |
| if (basenamelen > 0) |
| memcopy(dest, &destlen, destsize, &src[lastslash + 1], |
| basenamelen); |
| } |
| |
| /* Add hash and suffix. */ |
| for (i = 0; i < 20; i++) |
| hexencode(dest, &destlen, destsize, sha[i]); |
| |
| if (lastdot >= 0) |
| memcopy(dest, &destlen, destsize, &src[lastdot], |
| len - lastdot - 1); |
| |
| PyString_GET_SIZE(ret) = destlen; |
| |
| return ret; |
| } |
| |
| /* |
| * Avoiding a trip through Python would improve performance by 50%, |
| * but we don't encounter enough long names to be worth the code. |
| */ |
| static int sha1hash(char hash[20], const char *str, Py_ssize_t len) |
| { |
| static PyObject *shafunc; |
| PyObject *shaobj, *hashobj; |
| |
| if (shafunc == NULL) { |
| PyObject *util, *name = PyString_FromString("mercurial.util"); |
| |
| if (name == NULL) |
| return -1; |
| |
| util = PyImport_Import(name); |
| Py_DECREF(name); |
| |
| if (util == NULL) { |
| PyErr_SetString(PyExc_ImportError, "mercurial.util"); |
| return -1; |
| } |
| shafunc = PyObject_GetAttrString(util, "sha1"); |
| Py_DECREF(util); |
| |
| if (shafunc == NULL) { |
| PyErr_SetString(PyExc_AttributeError, |
| "module 'mercurial.util' has no " |
| "attribute 'sha1'"); |
| return -1; |
| } |
| } |
| |
| shaobj = PyObject_CallFunction(shafunc, "s#", str, len); |
| |
| if (shaobj == NULL) |
| return -1; |
| |
| hashobj = PyObject_CallMethod(shaobj, "digest", ""); |
| Py_DECREF(shaobj); |
| |
| if (!PyString_Check(hashobj) || PyString_GET_SIZE(hashobj) != 20) { |
| PyErr_SetString(PyExc_TypeError, |
| "result of digest is not a 20-byte hash"); |
| Py_DECREF(hashobj); |
| return -1; |
| } |
| |
| memcpy(hash, PyString_AS_STRING(hashobj), 20); |
| Py_DECREF(hashobj); |
| return 0; |
| } |
| |
| #define MAXENCODE 4096 * 4 |
| |
| static PyObject *hashencode(const char *src, Py_ssize_t len) |
| { |
| char dired[MAXENCODE]; |
| char lowered[MAXENCODE]; |
| char auxed[MAXENCODE]; |
| Py_ssize_t dirlen, lowerlen, auxlen, baselen; |
| char sha[20]; |
| |
| baselen = (len - 5) * 3; |
| if (baselen >= MAXENCODE) { |
| PyErr_SetString(PyExc_ValueError, "string too long"); |
| return NULL; |
| } |
| |
| dirlen = _encodedir(dired, baselen, src, len); |
| if (sha1hash(sha, dired, dirlen - 1) == -1) |
| return NULL; |
| lowerlen = _lowerencode(lowered, baselen, dired + 5, dirlen - 5); |
| auxlen = auxencode(auxed, baselen, lowered, lowerlen); |
| return hashmangle(auxed, auxlen, sha); |
| } |
| |
| PyObject *pathencode(PyObject *self, PyObject *args) |
| { |
| Py_ssize_t len, newlen; |
| PyObject *pathobj, *newobj; |
| char *path; |
| |
| if (!PyArg_ParseTuple(args, "O:pathencode", &pathobj)) |
| return NULL; |
| |
| if (PyString_AsStringAndSize(pathobj, &path, &len) == -1) { |
| PyErr_SetString(PyExc_TypeError, "expected a string"); |
| return NULL; |
| } |
| |
| if (len > maxstorepathlen) |
| newlen = maxstorepathlen + 2; |
| else |
| newlen = len ? basicencode(NULL, 0, path, len + 1) : 1; |
| |
| if (newlen <= maxstorepathlen + 1) { |
| if (newlen == len + 1) { |
| Py_INCREF(pathobj); |
| return pathobj; |
| } |
| |
| newobj = PyString_FromStringAndSize(NULL, newlen); |
| |
| if (newobj) { |
| PyString_GET_SIZE(newobj)--; |
| basicencode(PyString_AS_STRING(newobj), newlen, path, |
| len + 1); |
| } |
| } |
| else |
| newobj = hashencode(path, len + 1); |
| |
| return newobj; |
| } |