#ifndef __VIENNA_RNA_PACKAGE_PART_FUNC_H__ #define __VIENNA_RNA_PACKAGE_PART_FUNC_H__ #include "data_structures.h" #ifdef __GNUC__ #define DEPRECATED(func) func __attribute__ ((deprecated)) #else #define DEPRECATED(func) func #endif /** * \addtogroup pf_fold * \brief This section provides information about all functions and variables related to * the calculation of the partition function and base pair probabilities. * * Instead of the minimum free energy structure the partition function of all possible structures * and from that the pairing probability for every possible pair can be calculated, using a dynamic * programming algorithm as described in \cite mccaskill:1990. * * @{ * \file part_func.h * \brief Partition function of single RNA sequences * * This file includes (almost) all function declarations within the RNAlib that are related to * Partion function folding... * @} */ /** * \brief Flag indicating that auxilary arrays are needed throughout the computations. This is essential for stochastic backtracking * * Set this variable to 1 prior to a call of pf_fold() to ensure that all matrices needed for stochastic backtracking * are filled in the forward recursions * * \ingroup subopt_stochbt * * \see pbacktrack(), pbacktrack_circ */ extern int st_back; /* ################################################# # PARTITION FUNCTION COMPUTATION # ################################################# */ /** * \brief Compute the partition function \f$Q\f$ for a given RNA sequence * * If \a structure is not a NULL pointer on input, it contains on * return a string consisting of the letters " . , | { } ( ) " denoting * bases that are essentially unpaired, weakly paired, strongly paired without * preference, weakly upstream (downstream) paired, or strongly up- * (down-)stream paired bases, respectively. * If #fold_constrained is not 0, the \a structure string is * interpreted on input as a list of constraints for the folding. The * character "x" marks bases that must be unpaired, matching brackets " ( ) " * denote base pairs, all other characters are ignored. Any pairs * conflicting with the constraint will be forbidden. This is usually sufficient * to ensure the constraints are honored. * If tha parameter calculate_bppm is set to 0 base pairing probabilities will not * be computed (saving CPU time), otherwise after calculations took place #pr will * contain the probability that bases \a i and \a j pair. * * \ingroup pf_fold * * \note The global array #pr is deprecated and the user who wants the calculated * base pair probabilities for further computations is advised to use the function * export_bppm() * \post After successful run the hidden folding matrices are filled with the appropriate Boltzmann factors. * Depending on whether the global variable #do_backtrack was set the base pair probabilities are already * computed and may be accessed for further usage via the export_bppm() function. * A call of free_pf_arrays() will free all memory allocated by this function. * Successive calls will first free previously allocated memory before starting the computation. * \see pf_fold(), pf_circ_fold(), bppm_to_structure(), export_bppm(), get_boltzmann_factors(), free_pf_arrays() * \param[in] sequence The RNA sequence input * \param[in,out] structure A pointer to a char array where a base pair probability information can be stored in a * pseudo-dot-bracket notation (may be NULL, too) * \param[in] parameters Data structure containing the precalculated Boltzmann factors * \param[in] calculate_bppm Switch to Base pair probability calculations on/off (0==off) * \param[in] is_constrained Switch to indicate that a structure contraint is passed via the structure argument (0==off) * \param[in] is_circular Switch to (de-)activate postprocessing steps in case RNA sequence is circular (0==off) * \return The Gibbs free energy of the ensemble (\f$G = -RT \cdot \log(Q) \f$) in kcal/mol */ float pf_fold_par( const char *sequence, char *structure, pf_paramT *parameters, int calculate_bppm, int is_constrained, int is_circular); /** * \brief Compute the partition function \f$Q\f$ of an RNA sequence * * If \a structure is not a NULL pointer on input, it contains on * return a string consisting of the letters " . , | { } ( ) " denoting * bases that are essentially unpaired, weakly paired, strongly paired without * preference, weakly upstream (downstream) paired, or strongly up- * (down-)stream paired bases, respectively. * If #fold_constrained is not 0, the \a structure string is * interpreted on input as a list of constraints for the folding. The * character "x" marks bases that must be unpaired, matching brackets " ( ) " * denote base pairs, all other characters are ignored. Any pairs * conflicting with the constraint will be forbidden. This is usually sufficient * to ensure the constraints are honored. * If #do_backtrack has been set to 0 base pairing probabilities will not * be computed (saving CPU time), otherwise #pr will contain the probability * that bases \a i and \a j pair. * * \ingroup pf_fold * * \note The global array #pr is deprecated and the user who wants the calculated * base pair probabilities for further computations is advised to use the function * export_bppm(). * \note \b OpenMP: * This function is not entirely threadsafe. While the recursions are working on their * own copies of data the model details for the recursions are determined from the global * settings just before entering the recursions. Consider using pf_fold_par() for a * really threadsafe implementation. * \pre This function takes its model details from the global variables provided in \e RNAlib * \post After successful run the hidden folding matrices are filled with the appropriate Boltzmann factors. * Depending on whether the global variable #do_backtrack was set the base pair probabilities are already * computed and may be accessed for further usage via the export_bppm() function. * A call of free_pf_arrays() will free all memory allocated by this function. * Successive calls will first free previously allocated memory before starting the computation. * \see pf_fold_par(), pf_circ_fold(), bppm_to_structure(), export_bppm() * \param sequence The RNA sequence input * \param structure A pointer to a char array where a base pair probability information can be stored in a pseudo-dot-bracket notation (may be NULL, too) * \return The Gibbs free energy of the ensemble (\f$G = -RT \cdot \log(Q) \f$) in kcal/mol */ float pf_fold(const char *sequence, char *structure); /** * \brief Compute the partition function of a circular RNA sequence * * \ingroup pf_fold * * \note The global array #pr is deprecated and the user who wants the calculated * base pair probabilities for further computations is advised to use the function * export_bppm(). * \note \b OpenMP: * This function is not entirely threadsafe. While the recursions are working on their * own copies of data the model details for the recursions are determined from the global * settings just before entering the recursions. Consider using pf_fold_par() for a * really threadsafe implementation. * \pre This function takes its model details from the global variables provided in \e RNAlib * \post After successful run the hidden folding matrices are filled with the appropriate Boltzmann factors. * Depending on whether the global variable #do_backtrack was set the base pair probabilities are already * computed and may be accessed for further usage via the export_bppm() function. * A call of free_pf_arrays() will free all memory allocated by this function. * Successive calls will first free previously allocated memory before starting the computation. * \see pf_fold_par(), pf_fold() * \param[in] sequence The RNA sequence input * \param[in,out] structure A pointer to a char array where a base pair probability information can be * stored in a pseudo-dot-bracket notation (may be NULL, too) * \return The Gibbs free energy of the ensemble (\f$G = -RT \cdot \log(Q) \f$) in kcal/mol */ float pf_circ_fold( const char *sequence, char *structure); /** * \brief Sample a secondary structure from the Boltzmann ensemble according its probability\n * * \ingroup subopt_stochbt * \pre pf_fold_par() or pf_fold() have to be called first to fill the partition function matrices * * \param sequence The RNA sequence * \return A sampled secondary structure in dot-bracket notation */ char *pbacktrack(char *sequence); /** * \brief Sample a secondary structure of a circular RNA from the Boltzmann ensemble according its probability * * This function does the same as \ref pbacktrack() but assumes the RNA molecule to be circular * * \ingroup subopt_stochbt * \pre pf_fold_par() or pf_fold_circ() have to be called first to fill the partition function matrices * * \param sequence The RNA sequence * \return A sampled secondary structure in dot-bracket notation */ char *pbacktrack_circ(char *sequence); /** * \brief Free arrays for the partition function recursions * * Call this function if you want to free all allocated memory associated with * the partition function forward recursion. * \note Successive calls of pf_fold(), pf_circ_fold() already check if they should free * any memory from a previous run. * \note OpenMP notice:
* This function should be called before leaving a thread in order to avoid leaking memory * * \ingroup pf_fold * * \post All memory allocated by pf_fold_par(), pf_fold() or pf_circ_fold() will be free'd * \see pf_fold_par(), pf_fold(), pf_circ_fold() */ void free_pf_arrays(void); /** * \brief Recalculate energy parameters * * Call this function to recalculate the pair matrix and energy parameters * after a change in folding parameters like #temperature * * \ingroup pf_fold * */ void update_pf_params(int length); /** * \brief Recalculate energy parameters * * \ingroup pf_fold * */ void update_pf_params_par(int length, pf_paramT *parameters); /** * \brief Get a pointer to the base pair probability array * \ingroup pf_fold * * Accessing the base pair probabilities for a pair (i,j) is achieved by * \code * FLT_OR_DBL *pr = export_bppm(); * pr_ij = pr[iindx[i]-j]; * \endcode * * \pre Call pf_fold_par(), pf_fold() or pf_circ_fold() first to fill the base pair probability array * * \see pf_fold(), pf_circ_fold(), get_iindx() * * \return A pointer to the base pair probability array */ FLT_OR_DBL *export_bppm(void); /* ################################################# # OTHER PARTITION FUNCTION RELATED DECLARATIONS # ################################################# */ /** * \brief Create a plist from a probability matrix * * The probability matrix given is parsed and all pair probabilities above * the given threshold are used to create an entry in the plist * * The end of the plist is marked by sequence positions i as well as j * equal to 0. This condition should be used to stop looping over its * entries * * \note This function is threadsafe * \ingroup pf_fold * \param[out] pl A pointer to the plist that is to be created * \param[in] probs The probability matrix used for creting the plist * \param[in] length The length of the RNA sequence * \param[in] cutoff The cutoff value */ void assign_plist_from_pr( plist **pl, FLT_OR_DBL *probs, int length, double cutoff); /* this doesn't work if free_pf_arrays() is called before */ void assign_plist_gquad_from_pr(plist **pl, int length, double cut_off); char *get_centroid_struct_gquad_pr(int length, double *dist); /** * \brief Get the pointers to (almost) all relavant computation arrays used in partition function computation * * \ingroup pf_fold * \pre In order to assign meaningful pointers, you have to call pf_fold_par() or pf_fold() first! * \see pf_fold_par(), pf_fold(), pf_circ_fold() * \param[out] S_p A pointer to the 'S' array (integer representation of nucleotides) * \param[out] S1_p A pointer to the 'S1' array (2nd integer representation of nucleotides) * \param[out] ptype_p A pointer to the pair type matrix * \param[out] qb_p A pointer to the QB matrix * \param[out] qm_p A pointer to the QM matrix * \param[out] q1k_p A pointer to the 5' slice of the Q matrix (\f$q1k(k) = Q(1, k)\f$) * \param[out] qln_p A pointer to the 3' slice of the Q matrix (\f$qln(l) = Q(l, n)\f$) * \return Non Zero if everything went fine, 0 otherwise */ int get_pf_arrays(short **S_p, short **S1_p, char **ptype_p, FLT_OR_DBL **qb_p, FLT_OR_DBL **qm_p, FLT_OR_DBL **q1k_p, FLT_OR_DBL **qln_p); /** * \brief Get the free energy of a subsequence from the q[] array */ double get_subseq_F(int i, int j); /** * \brief Get the centroid structure of the ensemble * * This function is a threadsafe replacement for \ref centroid() with a 'plist' input * * The centroid is the structure with the minimal average distance to all other structures * \n \f$ = \sum_{(i,j) \in S} (1-p_{ij}) + \sum_{(i,j) \notin S} p_{ij} \f$ \n * Thus, the centroid is simply the structure containing all pairs with \f$p_ij>0.5\f$ * The distance of the centroid to the ensemble is written to the memory adressed by \a dist. * * \ingroup centroid_fold * \param[in] length The length of the sequence * \param[out] dist A pointer to the distance variable where the centroid distance will be written to * \param[in] pl A pair list containing base pair probability information about the ensemble * \return The centroid structure of the ensemble in dot-bracket notation */ char *get_centroid_struct_pl(int length, double *dist, plist *pl); /** * \brief Get the centroid structure of the ensemble * * This function is a threadsafe replacement for \ref centroid() with a probability array input * * The centroid is the structure with the minimal average distance to all other structures * \n \f$ = \sum_{(i,j) \in S} (1-p_{ij}) + \sum_{(i,j) \notin S} p_{ij} \f$ \n * Thus, the centroid is simply the structure containing all pairs with \f$p_ij>0.5\f$ * The distance of the centroid to the ensemble is written to the memory adressed by \a dist. * * \ingroup centroid_fold * \param[in] length The length of the sequence * \param[out] dist A pointer to the distance variable where the centroid distance will be written to * \param[in] pr A upper triangular matrix containing base pair probabilities (access via iindx \ref get_iindx() ) * \return The centroid structure of the ensemble in dot-bracket notation */ char *get_centroid_struct_pr(int length, double *dist, FLT_OR_DBL *pr); /** * \brief Get the mean base pair distance of the last partition function computation * * \note To ensure thread-safety, use the function mean_bp_distance_pr() instead! * * \ingroup pf_fold * * \see mean_bp_distance_pr() * * \param length * \return mean base pair distance in thermodynamic ensemble */ double mean_bp_distance(int length); /** * \brief Get the mean base pair distance in the thermodynamic ensemble * * This is a threadsafe implementation of \ref mean_bp_dist() ! * * \f$ = \sum_{a,b} p_a p_b d(S_a,S_b)\f$\n * this can be computed from the pair probs \f$p_ij\f$ as\n * \f$ = \sum_{ij} p_{ij}(1-p_{ij})\f$ * * \note This function is threadsafe * * \ingroup pf_fold * * \param length The length of the sequence * \param pr The matrix containing the base pair probabilities * \return The mean pair distance of the structure ensemble */ double mean_bp_distance_pr(int length, FLT_OR_DBL *pr); /** * \brief Create a dot-bracket like structure string from base pair probability matrix */ void bppm_to_structure(char *structure, FLT_OR_DBL *pr, unsigned int length); plist *stackProb(double cutoff); /** * \brief Get a pseudo dot bracket notation for a given probability information */ char bppm_symbol(const float *x); /* ################################################# # DEPRECATED FUNCTIONS # ################################################# */ /** * \brief Allocate space for pf_fold() * * \deprecated This function is obsolete and will be removed soon! */ DEPRECATED(void init_pf_fold(int length)); /** * \deprecated This function is deprecated and should not be used anymore as it is not threadsafe! * \see get_centroid_struct_pl(), get_centroid_struct_pr() */ DEPRECATED(char *centroid(int length, double *dist)); /* mean pair distance of ensemble */ /** * get the mean pair distance of ensemble * * \deprecated This function is not threadsafe and should not be used anymore. Use \ref mean_bp_distance() instead! */ DEPRECATED(double mean_bp_dist(int length)); /** * \deprecated Use \ref exp_E_IntLoop() from loop_energies.h instead */ DEPRECATED(double expLoopEnergy(int u1, int u2, int type, int type2, short si1, short sj1, short sp1, short sq1)); /** * \deprecated Use exp_E_Hairpin() from loop_energies.h instead */ DEPRECATED(double expHairpinEnergy( int u, int type, short si1, short sj1, const char *string)); #endif