@@ -109,6 +109,109 @@ static Py_ssize_t unicode_aswidechar(PyObject *unicode, wchar_t *w, Py_ssize_t s
109109 }
110110}
111111
/*
 * Raw field accessors for unicode objects.
 *
 * Each macro casts `op` to the struct type that declares the field and
 * yields the field itself, so the expansion can be read from or assigned to
 * (unicode_subtype_new uses them as assignment targets).  No type or
 * readiness check is performed on `op`.
 */
#define _PyUnicode_UTF8(op)         (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)         (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject*)(op))->data.any)

/* NOTE(review): presumably this is what provides polyglot_from_PyUnicodeObject,
   which unicode_subtype_new calls on its result -- confirm against the
   polyglot header. */
POLYGLOT_DECLARE_TYPE(PyUnicodeObject);
128+
129+ PyUnicodeObject * unicode_subtype_new (PyTypeObject * type , PyObject * unicode ) {
130+ PyObject * self ;
131+ Py_ssize_t length , char_size ;
132+ int share_wstr , share_utf8 ;
133+ unsigned int kind ;
134+ void * data ;
135+
136+ if (unicode == NULL )
137+ return NULL ;
138+ assert (_PyUnicode_CHECK (unicode ));
139+ if (PyUnicode_READY (unicode ) == -1 ) {
140+ Py_DECREF (unicode );
141+ return NULL ;
142+ }
143+
144+ self = type -> tp_alloc (type , 0 );
145+ if (self == NULL ) {
146+ Py_DECREF (unicode );
147+ return NULL ;
148+ }
149+ kind = PyUnicode_KIND (unicode );
150+ length = PyUnicode_GET_LENGTH (unicode );
151+
152+ _PyUnicode_LENGTH (self ) = length ;
153+ _PyUnicode_STATE (self ).interned = 0 ;
154+ _PyUnicode_STATE (self ).kind = kind ;
155+ _PyUnicode_STATE (self ).compact = 0 ;
156+ _PyUnicode_STATE (self ).ascii = _PyUnicode_STATE (unicode ).ascii ;
157+ _PyUnicode_STATE (self ).ready = 1 ;
158+ _PyUnicode_WSTR (self ) = NULL ;
159+ _PyUnicode_UTF8_LENGTH (self ) = 0 ;
160+ _PyUnicode_UTF8 (self ) = NULL ;
161+ _PyUnicode_WSTR_LENGTH (self ) = 0 ;
162+ _PyUnicode_DATA_ANY (self ) = NULL ;
163+
164+ share_utf8 = 0 ;
165+ share_wstr = 0 ;
166+ if (kind == PyUnicode_1BYTE_KIND ) {
167+ char_size = 1 ;
168+ if (PyUnicode_MAX_CHAR_VALUE (unicode ) < 128 )
169+ share_utf8 = 1 ;
170+ }
171+ else if (kind == PyUnicode_2BYTE_KIND ) {
172+ char_size = 2 ;
173+ if (sizeof (wchar_t ) == 2 )
174+ share_wstr = 1 ;
175+ }
176+ else {
177+ assert (kind == PyUnicode_4BYTE_KIND );
178+ char_size = 4 ;
179+ if (sizeof (wchar_t ) == 4 )
180+ share_wstr = 1 ;
181+ }
182+
183+ /* Ensure we won't overflow the length. */
184+ if (length > (PY_SSIZE_T_MAX / char_size - 1 )) {
185+ PyErr_NoMemory ();
186+ // Py_DECREF(unicode);
187+ // Py_DECREF(self);
188+ return NULL ;
189+ }
190+ data = malloc ((length + 1 ) * char_size );
191+ if (data == NULL ) {
192+ PyErr_NoMemory ();
193+ // Py_DECREF(unicode);
194+ // Py_DECREF(self);
195+ return NULL ;
196+ }
197+
198+ _PyUnicode_DATA_ANY (self ) = data ;
199+ if (share_utf8 ) {
200+ _PyUnicode_UTF8_LENGTH (self ) = length ;
201+ _PyUnicode_UTF8 (self ) = data ;
202+ }
203+ if (share_wstr ) {
204+ _PyUnicode_WSTR_LENGTH (self ) = length ;
205+ _PyUnicode_WSTR (self ) = (wchar_t * )data ;
206+ }
207+
208+ memcpy (data , PyUnicode_DATA (unicode ),
209+ kind * (length + 1 ));
210+ assert (_PyUnicode_CheckConsistency (self , 1 ));
211+ Py_DECREF (unicode );
212+ return (PyUnicodeObject * ) polyglot_from_PyUnicodeObject ((PyUnicodeObject * )self );
213+ }
214+
112215PyObject * PyUnicode_FromString (const char * o ) {
113216 return to_sulong (polyglot_from_string (o , SRC_CS ));
114217}
@@ -245,9 +348,8 @@ PyObject* PyUnicode_FromObject(PyObject* o) {
245348 return UPCALL_CEXT_O (_jls_PyUnicode_FromObject , native_to_java (o ));
246349}
247350
248- UPCALL_ID (PyUnicode_GetLength );
249351Py_ssize_t PyUnicode_GetLength (PyObject * unicode ) {
250- return UPCALL_CEXT_L ( _jls_PyUnicode_GetLength , native_to_java ( unicode ) );
352+ return PyUnicode_GET_LENGTH ( unicode );
251353}
252354
253355UPCALL_ID (PyUnicode_Concat );
@@ -305,7 +407,7 @@ PyObject * PyUnicode_DecodeUTF32(const char *s, Py_ssize_t size, const char *err
305407 PyObject * result ;
306408 void * jerrors = errors != NULL ? polyglot_from_string (errors , SRC_CS ) : NULL ;
307409 int bo = byteorder != NULL ? * byteorder : 0 ;
308- return polyglot_invoke (PY_TRUFFLE_CEXT , "PyTruffle_Unicode_DecodeUTF32" , s , size , native_to_java (jerrors ), bo , NULL );
410+ return polyglot_invoke (PY_TRUFFLE_CEXT , "PyTruffle_Unicode_DecodeUTF32" , polyglot_from_i8_array ( s , size ) , size , native_to_java (jerrors ), bo , NULL );
309411}
310412
311413Py_ssize_t PyUnicode_AsWideChar (PyObject * unicode , wchar_t * w , Py_ssize_t size ) {
@@ -525,3 +627,67 @@ UPCALL_ID(PyUnicode_Replace);
525627PyObject * PyUnicode_Replace (PyObject * str , PyObject * substr , PyObject * replstr , Py_ssize_t maxcount ) {
526628 return UPCALL_CEXT_O (_jls_PyUnicode_Replace , native_to_java (str ), native_to_java (substr ), native_to_java (replstr ), maxcount );
527629}
630+
/* Widen or narrow a run of characters from one integer type to another.

   from_type / to_type  valid type names of the source and target characters
   begin / end          pointers delimiting the source run ("from_type *")
   to                   pointer to the output buffer ("to_type *"); it must
                        have room for (end - begin) characters

   Every argument is evaluated exactly once.  The bulk of the run is copied
   four characters per iteration; the remaining 0-3 characters are copied
   one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)        \
    do {                                                                    \
        to_type *_dst = (to_type *)(to);                                    \
        const from_type *_src = (from_type *)(begin);                       \
        const from_type *_stop = (from_type *)(end);                        \
        Py_ssize_t _count = (_stop) - (_src);                               \
        const from_type *_bulk_stop =                                       \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);                          \
        for (; _src < (_bulk_stop); _src += 4, _dst += 4) {                 \
            _dst[0] = (to_type) _src[0];                                    \
            _dst[1] = (to_type) _src[1];                                    \
            _dst[2] = (to_type) _src[2];                                    \
            _dst[3] = (to_type) _src[3];                                    \
        }                                                                   \
        for (; _src < (_stop); ++_src, ++_dst) {                            \
            *_dst = (to_type) *_src;                                        \
        }                                                                   \
    } while (0)
654+
655+
656+ POLYGLOT_DECLARE_TYPE (Py_UCS4 );
657+
658+ /* used from Java only to decode a native unicode object */
659+ void * native_unicode_as_string (PyObject * string ) {
660+ Py_UCS4 * target = NULL ;
661+ int kind = 0 ;
662+ void * data = NULL ;
663+ void * result = NULL ;
664+ Py_ssize_t len ;
665+ if (PyUnicode_READY (string ) == -1 ) {
666+ PyErr_Format (PyExc_TypeError , "provided unicode object is not ready" );
667+ return NULL ;
668+ }
669+ kind = PyUnicode_KIND (string );
670+ data = PyUnicode_DATA (string );
671+ len = PyUnicode_GET_LENGTH (string );
672+ if (kind == PyUnicode_1BYTE_KIND ) {
673+ Py_UCS1 * start = (Py_UCS1 * ) data ;
674+ if (PyUnicode_IS_COMPACT_ASCII (string )) {
675+ return polyglot_from_string_n ((const char * )data , sizeof (Py_UCS1 ) * len , "ascii" );
676+ }
677+ return polyglot_from_string_n ((const char * )data , sizeof (Py_UCS1 ) * len , "latin1" );
678+ }
679+ else if (kind == PyUnicode_2BYTE_KIND ) {
680+ Py_UCS2 * start = (Py_UCS2 * ) data ;
681+ target = PyMem_New (Py_UCS4 , len );
682+ if (!target ) {
683+ PyErr_NoMemory ();
684+ return NULL ;
685+ }
686+ _PyUnicode_CONVERT_BYTES (Py_UCS2 , Py_UCS4 , start , start + len , target );
687+ result = polyglot_from_string_n ((const char * )target , sizeof (Py_UCS4 ) * len , "UTF-32" );
688+ free (target );
689+ return result ;
690+ }
691+ assert (kind == PyUnicode_4BYTE_KIND );
692+ return polyglot_from_string_n ((const char * )data , sizeof (Py_UCS4 ) * len , "UTF-32" );
693+ }
0 commit comments