2011-09-08 15:05:02 +00:00
/* Copyright 2010-2011 Florent Le Coz <louiz@louiz.org> */
/* This file is part of Poezio. */
/* Poezio is free software: you can redistribute it and/or modify */
2011-09-11 15:10:05 +00:00
/* it under the terms of the zlib license. See the COPYING file. */
2011-09-08 15:05:02 +00:00
/** The poopt python3 module
* */
/* This file is a python3 module for poezio, used to replace some time-critical
2011-09-14 11:59:25 +00:00
python functions that are too slow . */
2011-09-08 15:05:02 +00:00
# define PY_SSIZE_T_CLEAN
# include "Python.h"
/* Module-level exception object. It starts out NULL; PyInit_poopt creates it
 * with PyErr_NewException() the first time it runs and adds it to the module. */
PyObject * ErrorObject ;
2013-06-24 23:18:28 +00:00
/***
Internal functions
* * */
2013-06-24 23:08:16 +00:00
/**
2013-06-24 23:18:28 +00:00
Just checking if the return value is - 1. In some ( all ? ) implementations ,
2013-06-27 13:54:19 +00:00
wcwidth ( " 😆 " ) returns - 1 while it should return 1. In these cases , we
return 1 instead because this is by far the most probable real value .
Since the string is received from python , and the unicode character is
extracted with mbrtowc(), and supposing these two components are not
bugged , and since poezio ’ s code should never pass ' \t ' , ' \n ' or their
friends , a return value of - 1 from wcwidth ( ) is considered to be a bug in
wcwidth ( ) ( until proven otherwise ) . xwcwidth ( ) is here to work around
this bug . */
2013-06-24 23:17:20 +00:00
static int xwcwidth(wchar_t c)
{
    const int width = wcwidth(c);

    /* Pass the libc answer through whenever it is usable. The poezio
     * formatting character \x19 is also passed through untouched, even
     * when wcwidth() reports -1 for it. */
    if (width != -1 || c == '\x19')
        return width;

    /* Any other -1 is treated as a wcwidth() bug: assume one column. */
    return 1;
}
2011-09-08 15:05:02 +00:00
/***
The module functions
* * */
2013-06-19 20:04:46 +00:00
/**
cut_text : takes a string and returns a tuple of int .
2013-06-27 13:54:19 +00:00
Each two-int tuple is a line, represented by its starting position and its
ending position (where it should be cut). Note that these positions are
computed using the positions of the python string characters, not just the
individual bytes.
For example ,
poopt_cut_text ( " vivent les réfrigérateurs " , 6 ) ;
will return [ ( 0 , 6 ) , ( 7 , 10 ) , ( 11 , 17 ) , ( 17 , 22 ) , ( 22 , 24 ) ] , meaning that
the lines are
2013-06-19 20:04:46 +00:00
" vivent " , " les " , " réfrig " , " érateu " and " rs "
2013-06-27 13:54:19 +00:00
2011-09-08 15:05:02 +00:00
*/
2011-09-14 11:59:25 +00:00
PyDoc_STRVAR ( poopt_cut_text_doc , " cut_text(text, width) \n \n \n Return a list of two-tuple, the first int is the starting position of the line and the second is its end. " ) ;
2011-09-08 15:05:02 +00:00
2013-06-19 20:04:46 +00:00
static PyObject * poopt_cut_text ( PyObject * self , PyObject * args )
2011-09-08 15:05:02 +00:00
{
2013-06-19 20:04:46 +00:00
/* The list of tuples that we return */
PyObject * retlist = PyList_New ( 0 ) ;
2014-04-27 19:33:00 +00:00
/* The temporary name for the tuples */
PyObject * tmp ;
2011-09-08 15:05:02 +00:00
2013-06-19 20:04:46 +00:00
/* Get the python arguments */
2013-06-20 19:56:50 +00:00
const size_t width ;
2013-06-19 20:04:46 +00:00
const char * buffer ;
2013-07-07 18:24:03 +00:00
const Py_ssize_t buffer_len ;
2011-09-08 15:05:02 +00:00
2013-06-20 21:28:40 +00:00
if ( PyArg_ParseTuple ( args , " s#k " , & buffer , & buffer_len , & width ) = = 0 )
2013-06-19 20:04:46 +00:00
return NULL ;
2011-09-08 15:05:02 +00:00
2013-06-19 20:04:46 +00:00
/* Pointer to the end of the string */
2013-06-20 19:56:50 +00:00
const char * const end = buffer + buffer_len ;
2011-09-08 15:05:02 +00:00
2013-06-27 13:54:19 +00:00
/* The position, considering unicode chars (aka, the position in the
2013-06-19 20:04:46 +00:00
* python string ) . This is used to determine the position in the python
* string at which we should cut */
2013-06-20 19:56:50 +00:00
unsigned int spos = 0 ;
2013-06-19 20:04:46 +00:00
/* The start position (in the python-string) of the next line */
2013-06-20 19:56:50 +00:00
unsigned int start_pos = 0 ;
2013-06-19 20:04:46 +00:00
/* The position of the last space seen in the current line. This is used
* to cut on spaces instead of cutting inside words , if possible ( aka if
* there is a space ) */
int last_space = - 1 ;
/* The number of columns taken by chars between start_pos and last_space */
size_t cols_until_space = 0 ;
/* The number of bytes consumed by mbrtowc. We advance the buffer ptr by this value */
size_t consumed ;
/* Number of columns taken to display the current line so far */
size_t columns = 0 ;
2013-06-27 13:54:19 +00:00
/* The unicode character found by mbrtowc */
2013-06-19 20:04:46 +00:00
wchar_t wc ;
while ( buffer < end )
2011-09-08 15:05:02 +00:00
{
2013-06-19 20:04:46 +00:00
/* Special case to jump poezio special characters that are contained
* in the python string , but should not be counted as chars , because
* they will not be displayed . Those are the formatting chars ( to
* insert colors or things like that in the string ) */
if ( * buffer = = 25 ) /* \x19 */
{
/* Jump everything until the end of this format marker, but
* without increasing the number of columns of the current
* line . Because these chars are not printed . */
while ( buffer < end & & * buffer ! = ' u ' & &
* buffer ! = ' a ' & & * buffer ! = ' i ' & &
* buffer ! = ' b ' & & * buffer ! = ' o ' & &
* buffer ! = ' } ' )
{
buffer + + ;
spos + + ;
}
buffer + + ;
spos + + ;
continue ;
}
/* Find the next unicode character (a wchar_t) in the string. This
* may consume from one to 4 bytes . */
consumed = mbrtowc ( & wc , buffer , end - buffer , NULL ) ;
if ( consumed = = 0 )
break ;
else if ( ( size_t ) - 1 = = consumed )
{
PyErr_SetString ( PyExc_UnicodeError ,
" mbrtowc returned -1: Invalid multibyte sequence. " ) ;
return NULL ;
}
else if ( ( size_t ) - 2 = = consumed )
{
PyErr_SetString ( PyExc_UnicodeError ,
" mbrtowc returned -2: Could not parse a complete multibyte character. " ) ;
return NULL ;
}
buffer + = consumed ;
/* This is one condition to end the line: an explicit \n is found */
if ( wc = = ( wchar_t ) ' \n ' )
{
spos + + ;
2014-04-27 19:33:00 +00:00
tmp = Py_BuildValue ( " II " , start_pos , spos ) ;
if ( PyList_Append ( retlist , tmp ) = = - 1 )
{
Py_XDECREF ( tmp ) ;
2013-06-19 20:04:46 +00:00
return NULL ;
2014-04-27 19:33:00 +00:00
}
Py_XDECREF ( tmp ) ;
2013-06-19 20:04:46 +00:00
/* And then initiate a new line */
start_pos = spos ;
last_space = - 1 ;
columns = 0 ;
continue ;
}
2013-06-24 23:08:16 +00:00
/* Get the number of columns needed to display this character. May be 0, 1 or 2 */
const size_t cols = xwcwidth ( wc ) ;
2013-06-19 20:04:46 +00:00
/* This is the second condition to end the line: we have consumed
2013-06-27 13:54:19 +00:00
* enough columns to fill a whole line */
2013-06-19 20:04:46 +00:00
if ( columns + cols > width )
{ /* If possible, cut on a space */
if ( last_space ! = - 1 )
{
2014-04-27 19:33:00 +00:00
tmp = Py_BuildValue ( " II " , start_pos , last_space ) ;
if ( PyList_Append ( retlist , tmp ) = = - 1 )
{
Py_XDECREF ( tmp ) ;
2013-06-19 20:04:46 +00:00
return NULL ;
2014-04-27 19:33:00 +00:00
}
Py_XDECREF ( tmp ) ;
2013-06-19 20:04:46 +00:00
start_pos = last_space + 1 ;
last_space = - 1 ;
columns - = ( cols_until_space + 1 ) ;
}
else
{
/* Otherwise, cut in the middle of a word */
2014-04-27 19:33:00 +00:00
tmp = Py_BuildValue ( " II " , start_pos , spos ) ;
if ( PyList_Append ( retlist , tmp ) = = - 1 )
{
Py_XDECREF ( tmp ) ;
2013-06-19 20:04:46 +00:00
return NULL ;
2014-04-27 19:33:00 +00:00
}
Py_XDECREF ( tmp ) ;
2013-06-19 20:04:46 +00:00
start_pos = spos ;
columns = 0 ;
}
}
/* We save the position of the last space seen in this line, and the
number of columns we have until now . This helps us keep track of
the columns to count when we will use that space as a cutting
point , later */
if ( wc = = ( wchar_t ) ' ' )
{
last_space = spos ;
cols_until_space = columns ;
}
/* We advanced from one char, increment spos by one and add the
* char ' s columns to the line ' s columns */
columns + = cols ;
spos + + ;
2011-09-08 15:05:02 +00:00
}
2013-06-19 20:04:46 +00:00
/* We are at the end of the string, append the last line, not finished */
2014-04-27 19:33:00 +00:00
tmp = Py_BuildValue ( " II " , start_pos , spos ) ;
if ( PyList_Append ( retlist , tmp ) = = - 1 )
{
Py_XDECREF ( tmp ) ;
2013-06-19 20:04:46 +00:00
return NULL ;
2014-04-27 19:33:00 +00:00
}
Py_XDECREF ( tmp ) ;
2013-06-19 20:04:46 +00:00
return retlist ;
2011-09-08 15:05:02 +00:00
}
2013-06-19 20:06:51 +00:00
/**
2013-06-27 13:54:19 +00:00
wcswidth : An emulation of the POSIX wcswidth ( 3 ) function using wcwidth
and mbrtowc .
2013-06-19 20:06:51 +00:00
*/
PyDoc_STRVAR(poopt_wcswidth_doc, "wcswidth(s)\n\n\nThe wcswidth() function returns the number of columns needed to represent the wide-character string pointed to by s. Raise UnicodeError if an invalid unicode value is passed");

/* wcswidth(s): sum of xwcwidth() over every character decoded from s.
 *
 * Argument (parsed with the "s#" format): the string to measure.
 * Returns a python int. Decoding stops early at an embedded NUL character.
 * Returns NULL with UnicodeError set if mbrtowc() cannot decode the input. */
static PyObject *poopt_wcswidth(PyObject *self, PyObject *args)
{
    /* Targets of PyArg_ParseTuple: they must be writable, hence not const */
    const char *string;
    Py_ssize_t len;
    if (PyArg_ParseTuple(args, "s#", &string, &len) == 0)
        return NULL;

    const char *const end = string + len;
    /* Private conversion state: a failed or partial conversion in one call
     * cannot disturb any other user of mbrtowc() */
    mbstate_t state = {0};
    wchar_t wc;
    int res = 0;

    while (string < end)
    {
        const size_t consumed = mbrtowc(&wc, string, (size_t)(end - string), &state);
        if (consumed == 0)
            break;
        if (consumed == (size_t)-1)
        {
            PyErr_SetString(PyExc_UnicodeError,
                            "mbrtowc returned -1: Invalid multibyte sequence.");
            return NULL;
        }
        if (consumed == (size_t)-2)
        {
            PyErr_SetString(PyExc_UnicodeError,
                            "mbrtowc returned -2: Could not parse a complete multibyte character.");
            return NULL;
        }
        string += consumed;
        res += xwcwidth(wc);
    }
    return Py_BuildValue("i", res);
}
2013-06-20 20:17:43 +00:00
/**
cut_by_columns : takes a python string and a number of columns , returns a
python string truncated to take at most that many columns
For example cut_by_columns("エメルカ", n) will return:
2013-06-27 13:54:19 +00:00
- n = = 5 - > " エメ " ( which takes only 4 columns since we can ' t cut the
next character in half )
2013-06-20 20:17:43 +00:00
- n = = 2 - > " エ "
- n = = 1 - > " "
- n = = 42 - > " エメルカ "
- etc
*/
PyDoc_STRVAR(poopt_cut_by_columns_doc, "cut_by_columns(string, n)\n\n\nreturns a string truncated to take at most n columns");

/* cut_by_columns(string, n): longest prefix of string that fits in n columns.
 *
 * Arguments (parsed with the "s#k" format, i.e. the string first and the
 * column limit second).
 *
 * Returns a new python string. A character is never cut in half: if it does
 * not fit entirely, the result stops before it. Decoding stops early at an
 * embedded NUL character.
 * Returns NULL with UnicodeError set if mbrtowc() cannot decode the input.
 *
 * Widths come from xwcwidth(), like in the other functions of this module,
 * and are accumulated in a signed counter: a -1 width can therefore no
 * longer wrap around to a huge unsigned value. */
static PyObject *poopt_cut_by_columns(PyObject *self, PyObject *args)
{
    /* Targets of PyArg_ParseTuple. They must be writable (not const), and
     * the "k" format unit stores an unsigned long. */
    const char *start;
    Py_ssize_t len;
    unsigned long limit;
    if (PyArg_ParseTuple(args, "s#k", &start, &len, &limit) == 0)
        return NULL;

    const char *const end = start + len;
    const char *ptr = start;
    /* Private conversion state: a failed or partial conversion in one call
     * cannot disturb any other user of mbrtowc() */
    mbstate_t state = {0};
    wchar_t wc;
    /* The number of columns that the string would take so far */
    Py_ssize_t columns = 0;

    while (ptr < end)
    {
        /* The limit is reached exactly: nothing more can be added */
        if (columns >= 0 && (unsigned long)columns == limit)
            break;

        const size_t consumed = mbrtowc(&wc, ptr, (size_t)(end - ptr), &state);
        if (consumed == 0)
            break;
        if (consumed == (size_t)-1)
        {
            PyErr_SetString(PyExc_UnicodeError,
                            "mbrtowc returned -1: Invalid multibyte sequence.");
            return NULL;
        }
        if (consumed == (size_t)-2)
        {
            PyErr_SetString(PyExc_UnicodeError,
                            "mbrtowc returned -2: Could not parse a complete multibyte character.");
            return NULL;
        }

        const Py_ssize_t next_columns = columns + xwcwidth(wc);
        if (next_columns > 0 && (unsigned long)next_columns > limit)
            /* Adding the next character would exceed the column limit */
            break;
        ptr += consumed;
        columns = next_columns;
    }
    return Py_BuildValue("s#", start, (Py_ssize_t)(ptr - start));
}
2011-09-08 15:05:02 +00:00
/***
2013-06-27 13:54:19 +00:00
Module initialization . Just taken from the xxmodule . c template from the
python sources .
2011-09-08 15:05:02 +00:00
* * */
static PyTypeObject Str_Type = {
PyVarObject_HEAD_INIT ( NULL , 0 )
" pooptmodule.Str " , /*tp_name*/
0 , /*tp_basicsize*/
0 , /*tp_itemsize*/
/* methods */
0 , /*tp_dealloc*/
0 , /*tp_print*/
0 , /*tp_getattr*/
0 , /*tp_setattr*/
0 , /*tp_reserved*/
0 , /*tp_repr*/
0 , /*tp_as_number*/
0 , /*tp_as_sequence*/
0 , /*tp_as_mapping*/
0 , /*tp_hash*/
0 , /*tp_call*/
0 , /*tp_str*/
0 , /*tp_getattro*/
0 , /*tp_setattro*/
0 , /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE , /*tp_flags*/
0 , /*tp_doc*/
0 , /*tp_traverse*/
0 , /*tp_clear*/
0 , /*tp_richcompare*/
0 , /*tp_weaklistoffset*/
0 , /*tp_iter*/
0 , /*tp_iternext*/
0 , /*tp_methods*/
0 , /*tp_members*/
0 , /*tp_getset*/
0 , /* see PyInit_xx */ /*tp_base*/
0 , /*tp_dict*/
0 , /*tp_descr_get*/
0 , /*tp_descr_set*/
0 , /*tp_dictoffset*/
0 , /*tp_init*/
0 , /*tp_alloc*/
0 , /*tp_new*/
0 , /*tp_free*/
0 , /*tp_is_gc*/
} ;
/* Rich comparison slot of Null_Type: whatever the operands and the operator,
 * hand back a new reference to NotImplemented. */
static PyObject *
null_richcompare(PyObject *self, PyObject *other, int op)
{
    PyObject *result = Py_NotImplemented;

    Py_INCREF(result);
    return result;
}
static PyTypeObject Null_Type = {
PyVarObject_HEAD_INIT ( NULL , 0 )
" pooptmodule.Null " , /*tp_name*/
0 , /*tp_basicsize*/
0 , /*tp_itemsize*/
/* methods */
0 , /*tp_dealloc*/
0 , /*tp_print*/
0 , /*tp_getattr*/
0 , /*tp_setattr*/
0 , /*tp_reserved*/
0 , /*tp_repr*/
0 , /*tp_as_number*/
0 , /*tp_as_sequence*/
0 , /*tp_as_mapping*/
0 , /*tp_hash*/
0 , /*tp_call*/
0 , /*tp_str*/
0 , /*tp_getattro*/
0 , /*tp_setattro*/
0 , /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE , /*tp_flags*/
0 , /*tp_doc*/
0 , /*tp_traverse*/
0 , /*tp_clear*/
null_richcompare , /*tp_richcompare*/
0 , /*tp_weaklistoffset*/
0 , /*tp_iter*/
0 , /*tp_iternext*/
0 , /*tp_methods*/
0 , /*tp_members*/
0 , /*tp_getset*/
0 , /* see PyInit_xx */ /*tp_base*/
0 , /*tp_dict*/
0 , /*tp_descr_get*/
0 , /*tp_descr_set*/
0 , /*tp_dictoffset*/
0 , /*tp_init*/
0 , /*tp_alloc*/
0 , /* see PyInit_xx */ /*tp_new*/
0 , /*tp_free*/
0 , /*tp_is_gc*/
} ;
/* List of functions defined in the module */
static PyMethodDef poopt_methods [ ] = {
2013-06-19 20:04:46 +00:00
{ " cut_text " , poopt_cut_text , METH_VARARGS , poopt_cut_text_doc } ,
2013-06-19 20:06:51 +00:00
{ " wcswidth " , poopt_wcswidth , METH_VARARGS , poopt_wcswidth_doc } ,
2013-06-20 20:17:43 +00:00
{ " cut_by_columns " , poopt_cut_by_columns , METH_VARARGS , poopt_cut_by_columns_doc } ,
2013-06-19 20:04:46 +00:00
{ } /* sentinel */
2011-09-08 15:05:02 +00:00
} ;
PyDoc_STRVAR ( module_doc ,
2013-06-19 20:04:46 +00:00
" This is a template module just for instruction. And poopt. " ) ;
2011-09-08 15:05:02 +00:00
/* Initialization function for the module (*must* be called PyInit_xx) */
static struct PyModuleDef pooptmodule = {
PyModuleDef_HEAD_INIT ,
" poopt " ,
module_doc ,
- 1 ,
poopt_methods ,
NULL ,
NULL ,
NULL ,
NULL
} ;
PyMODINIT_FUNC
PyInit_poopt ( void )
{
PyObject * m = NULL ;
/* Due to cross platform compiler issues the slots must be filled
* here . It ' s required for portability to Windows without requiring
* C + + . */
Null_Type . tp_base = & PyBaseObject_Type ;
Null_Type . tp_new = PyType_GenericNew ;
Str_Type . tp_base = & PyUnicode_Type ;
/* Finalize the type object including setting type of the new type
* object ; doing it here is required for portability , too . */
/* if (PyType_Ready(&Xxo_Type) < 0) */
/* goto fail; */
/* Create the module and add the functions */
m = PyModule_Create ( & pooptmodule ) ;
if ( m = = NULL )
goto fail ;
/* Add some symbolic constants to the module */
if ( ErrorObject = = NULL ) {
ErrorObject = PyErr_NewException ( " poopt.error " , NULL , NULL ) ;
if ( ErrorObject = = NULL )
goto fail ;
}
Py_INCREF ( ErrorObject ) ;
PyModule_AddObject ( m , " error " , ErrorObject ) ;
/* Add Str */
if ( PyType_Ready ( & Str_Type ) < 0 )
goto fail ;
PyModule_AddObject ( m , " Str " , ( PyObject * ) & Str_Type ) ;
/* Add Null */
if ( PyType_Ready ( & Null_Type ) < 0 )
goto fail ;
PyModule_AddObject ( m , " Null " , ( PyObject * ) & Null_Type ) ;
return m ;
fail :
Py_XDECREF ( m ) ;
return NULL ;
}